Decision Tree Project

Pavan Kishore Kota Subramanya, ID: 2021AIML524

Just follow the directions below.

Build a classification model with Decision Trees. The main objective is to use the two splitting criteria of Gini index and Gain ratio and and observe the performance of the decision tree on the given data set. It is a real dataset about the students' knowledge status about the subject of Electrical DC Machines.

1.0 Environment setup

In [1]:
# Convert jupyter notebook into full screen
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Reset all variables and objects in notebook
%reset -f 

2.0 Confirming the working directory

In [1125]:
from os import chdir, getcwd
wd = getcwd()
wd
Out[1125]:
'C:\\Users\\Pavan Kota\\Python\\Jupyter Notebooks'

3.0 Import Libraries

Import pandas,seaborn, and the usual libraries.

In [1126]:
#!pip install chefboost
In [1127]:
%%html
<style>
img {align:left}
</style>
In [1128]:
from chefboost import Chefboost as chef

# for loading dataset
from dataprep.datasets import load_dataset
# importing function from DataPrep.eda
from dataprep.eda import create_report

import dalex as dx 

from datetime import datetime

from genetic_selection import GeneticSelectionCV
import graphviz

#Import data from your laptop's local folder
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler 
import io
from IPython.display import display
import itertools

import joblib
# os.sys.modules['sklearn.externals.joblib'] = joblib

from tensorflow import keras

from keras_tuner import RandomSearch
from keras.models import Sequential
from keras.layers import Dense,Dropout

from lightgbm import LGBMRegressor

import matplotlib.dates
import matplotlib.pyplot as plt
%matplotlib inline

import missingno as msno

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from mlxtend.plotting import plot_confusion_matrix

import numpy as np
from numpy.random import randn
from numpy.random import seed

from operator import sub
import os

import pandas as pd
import pandas_profiling

plt.rcParams["font.family"] = 'DejaVu Sans'

import re

from rfpimp import permutation_importances

from scipy import stats
from scipy.stats import boxcox
from scipy.stats import pearsonr
from scipy.stats import skew

import seaborn as sns

import sklearn
from sklearn import metrics
from sklearn import preprocessing #for scaling and pre-processing data
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble._forest import ForestClassifier, ForestRegressor
from sklearn.experimental import enable_iterative_imputer #enable th experimental feature of interative imputer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.impute import SimpleImputer # used for handling missing data
from sklearn.impute import IterativeImputer #the IterativeImputer in scikit-learn (view documentation) utilizes the data 
                                            #available in other features in order to estimate the missing values being
                                            #imputed
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.manifold import TSNE
from sklearn import metrics
from sklearn import linear_model
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss, make_scorer
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.model_selection import train_test_split #used for splitting data into training data and testing data
from sklearn.naive_bayes import GaussianNB 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder # used for encoding categorical data
from sklearn.preprocessing import LabelEncoder # used for encoding categorical data
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import StandardScaler # used for feature scaling
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import svm
from sklearn.svm import SVC 

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.eval_measures import rmse


from IPython.core.interactiveshell import InteractiveShell # display complete output cell

import time

import warnings
warnings.filterwarnings('ignore')

4.0 Q-1: Load the dataset and print the metadata in the notebook. - 1Mark

In [1129]:
inputData = pd.read_csv("04.00.00 Predict_student_ knowledge_level.csv")
In [1130]:
inputData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   STG         403 non-null    float64
 1   SCG         403 non-null    float64
 2   STR         403 non-null    float64
 3   LPR         403 non-null    float64
 4   PEG         403 non-null    float64
 5    UNS        403 non-null    object 
 6   Unnamed: 6  0 non-null      float64
 7   Unnamed: 7  0 non-null      float64
 8   Unnamed: 8  0 non-null      float64
dtypes: float64(8), object(1)
memory usage: 28.5+ KB

Columns 6, 7 and 8 seem to be empty with 0 non-null values, hence we will drop these three columns. After checking the CSV opened in excel, we confirmed that the these columns are null and have no values in them

In [1131]:
inputData.head(10)
Out[1131]:
STG SCG STR LPR PEG UNS Unnamed: 6 Unnamed: 7 Unnamed: 8
0 0.00 0.00 0.00 0.00 0.00 very_low NaN NaN NaN
1 0.08 0.08 0.10 0.24 0.90 High NaN NaN NaN
2 0.06 0.06 0.05 0.25 0.33 Low NaN NaN NaN
3 0.10 0.10 0.15 0.65 0.30 Middle NaN NaN NaN
4 0.08 0.08 0.08 0.98 0.24 Low NaN NaN NaN
5 0.09 0.15 0.40 0.10 0.66 Middle NaN NaN NaN
6 0.10 0.10 0.43 0.29 0.56 Middle NaN NaN NaN
7 0.15 0.02 0.34 0.40 0.01 very_low NaN NaN NaN
8 0.20 0.14 0.35 0.72 0.25 Low NaN NaN NaN
9 0.00 0.00 0.50 0.20 0.85 High NaN NaN NaN

5.0 Q-2: Print a heatmap to check NULL values and Correlation values. - 1Mark

5.1 Check for null values via a heatmap

In [1132]:
sns.heatmap(inputData.isnull(), cbar=False)
Out[1132]:
<AxesSubplot:>
In [1133]:
msno.matrix(inputData)
Out[1133]:
<AxesSubplot:>
In [1134]:
#Find number of missing cells in descending order
inputData.isnull().sum().sort_values(ascending=False)
Out[1134]:
Unnamed: 6    403
Unnamed: 7    403
Unnamed: 8    403
STG             0
SCG             0
STR             0
LPR             0
PEG             0
 UNS            0
dtype: int64
In [1135]:
#Find % of missing values in descending order
inputData.isnull().sum().sort_values(ascending=False)/len(inputData)
Out[1135]:
Unnamed: 6    1.0
Unnamed: 7    1.0
Unnamed: 8    1.0
STG           0.0
SCG           0.0
STR           0.0
LPR           0.0
PEG           0.0
 UNS          0.0
dtype: float64

Conclusion: There are null values in the last 3 columns of the dataframe and hence these columns can be dropped

In [1136]:
inputData.drop(inputData.columns[[6, 7, 8]], axis=1, inplace=True)
In [1137]:
inputData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   STG     403 non-null    float64
 1   SCG     403 non-null    float64
 2   STR     403 non-null    float64
 3   LPR     403 non-null    float64
 4   PEG     403 non-null    float64
 5    UNS    403 non-null    object 
dtypes: float64(5), object(1)
memory usage: 19.0+ KB

5.2 Check for duplicate rows

In [1138]:
# Selecting duplicate rows except first occurrence based on all columns
duplicateRows = inputData[inputData.duplicated()]
print("Duplicate Rows")
duplicateRows
Duplicate Rows
Out[1138]:
STG SCG STR LPR PEG UNS

Conclusion: There are no duplicate rows in the Data Frame

5.3 Check for duplicate columns

In [1139]:
# Selecting duplicate columns except first occurrence based on all rows
duplicateColumns = inputData.loc[:,inputData.apply(lambda x: x.duplicated(),axis=1).all()].copy()
print("Duplicate Columns")
duplicateColumns
Duplicate Columns
Out[1139]:
0
1
2
3
4
...
398
399
400
401
402

403 rows × 0 columns

In [1140]:
def getDuplicateColumns(df):
    '''
    Get a list of duplicate columns.
    It will iterate over all the columns in dataframe and find the columns whose contents are duplicate.
    :param df: Dataframe object
    :return: List of columns whose contents are duplicates.
    '''
    duplicateColumnNames = set()
    # Iterate over all the columns in dataframe
    for x in range(df.shape[1]):
        # Select column at xth index.
        col = df.iloc[:, x]
        # Iterate over all the columns in DataFrame from (x+1)th index till end
        for y in range(x + 1, df.shape[1]):
            # Select column at yth index.
            otherCol = df.iloc[:, y]
            # Check if two columns at x 7 y index are equal
            if col.equals(otherCol):
                duplicateColumnNames.add(df.columns.values[y])
    return list(duplicateColumnNames)

duplicateColumnNames = getDuplicateColumns(inputData)

#Print Duplicate Columns
duplicateColumnNames
Out[1140]:
[]

Conclusion: There are no duplicate columns in the Data Frame

5.4 Find Columns with Constant Data

In [1141]:
# Function to return the constant value columns of a given DataFrame
def remove_constant_value_features(df):
    return [e for e in df.columns if df[e].nunique() == 1]

drop_col = remove_constant_value_features(inputData)
drop_col

#inputData.columns[inputData1.nunique() <= 1]
Out[1141]:
[]

Conclusion: There are no columns with constant data

5.5 Find Cells with Special Characters in Data Frame

In [1142]:
#Check if all but the last columns are float columns or integer columns. If they have special characters they will not be float type or integer type
inputData.dtypes
Out[1142]:
STG     float64
SCG     float64
STR     float64
LPR     float64
PEG     float64
 UNS     object
dtype: object

Conclusion: There are no special characters found

5.6 Find Cells with Special Characters in Data Frame Column Names

UNS column name seems to have extra space before it which needs to be removed

In [1143]:
# remove spaces in columns name
inputData.columns = inputData.columns.str.replace(' ','')

#Check if all but the last columns are float columns or integer columns. If they have special characters they will not be float type or integer type
inputData.dtypes
Out[1143]:
STG    float64
SCG    float64
STR    float64
LPR    float64
PEG    float64
UNS     object
dtype: object

Conclusion: The white space before UNS has been removed

5.7 Check for correlation values via a heatmap

In [1144]:
dataplot = sns.heatmap(inputData.corr(), cmap="YlGnBu", annot=True)
In [1145]:
inputData.corr()
Out[1145]:
STG SCG STR LPR PEG
STG 1.000000 0.049023 -0.051889 0.113957 0.198629
SCG 0.049023 1.000000 0.121235 0.119716 0.193566
STR -0.051889 0.121235 1.000000 0.083423 0.148338
LPR 0.113957 0.119716 0.083423 1.000000 -0.039283
PEG 0.198629 0.193566 0.148338 -0.039283 1.000000

Conclusion: There are no major correlations between 'X' values

5.8 Check for multi-collinearity

In [1146]:
#Copy Dataframe into a new variable
CopiedData=inputData[["STG", "SCG", "STR", "LPR", "PEG"]].copy(deep=True)

def calc_vif(X):
    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return(vif)
#ensure you don't select Rating column
X = CopiedData.loc[:, ~CopiedData.columns.isin(['Rating'])]
calc_vif(X)
Out[1146]:
variables VIF
0 STG 3.467076
1 SCG 3.634657
2 STR 3.780716
3 LPR 3.419269
4 PEG 3.994313
  • VIF starts at 0 and has no upper limit
  • VIF = 1, no correlation between the independent variable and the other variables
  • 1 < VIF < 5, features are moderately correlated
  • 5 < VIF < 10 or more, indicates high multicollinearity between this independent variable and the others
  • VIF > 10, means high correlation between features and is cause for concern

We see that PEG has the highest VIF of 3.994313, but since all VIF values are less than 5 we don't need to remove any features

To understand correlation between independent variables better, we will create a pair plot

In [1147]:
sns.pairplot(CopiedData)
Out[1147]:
<seaborn.axisgrid.PairGrid at 0x20913e822b0>

Conclusion: Since all VIF values are less than 5 we don't need to remove any features

5.9 Check for Data Imbalance

In [1148]:
plt.title('No of Datapoints per UNS',fontsize=20)
sns.countplot(inputData.UNS)
plt.xticks(rotation=75)
plt.show()

'Very Low' and 'very_low' are the same category being represented in two different ways, which needs to be corrected. We will convert 'very_low' to 'Very Low'

In [1149]:
inputData["UNS"] = inputData["UNS"].replace('very_low','Very Low')
In [1150]:
plt.title('No of Datapoints per UNS',fontsize=20)
sns.countplot(inputData.UNS)
plt.xticks(rotation=75)
plt.show()

Conclusion: After merging of 'Very Low' as well, there is a class imbalance. This can be corrected using Synthetic Minority Oversampling Technique (SMOTE).

5.10 Check for Outliers

In [1151]:
# Plot outliers in box plot for X variables
# boxplot = inputData.boxplot(column=inputData.columns['Distance', 'Cost','Discount','Delivery charges','Surge charges','Packaging charges','ST','Tip'])
dfBoxPlot=inputData.select_dtypes(include='number')
dfBoxPlot.info()
boxplot = dfBoxPlot.boxplot(column=dfBoxPlot.columns.values.tolist())#[ :-1]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   STG     403 non-null    float64
 1   SCG     403 non-null    float64
 2   STR     403 non-null    float64
 3   LPR     403 non-null    float64
 4   PEG     403 non-null    float64
dtypes: float64(5)
memory usage: 15.9 KB

Conclusion: There are outliers in the STG feature which need to be removed

In [1152]:
for col in dfBoxPlot:
    plt.figure()
    dfBoxPlot.boxplot(column=[col])
In [1153]:
#remove outliers using IQR Range basis guidance from NIST
#https://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm

inputDataPlay=inputData.copy(deep=True)
inputDataPlay.info(verbose=True,memory_usage='deep',show_counts=True)

col_name = inputDataPlay.select_dtypes(include='number').columns.values.tolist()

Q1 = inputDataPlay[col_name].quantile(0.25)
Q3 = inputDataPlay[col_name].quantile(0.75)
IQR = Q3 - Q1
lower_boundary = Q1 - (1.5 * IQR)
upper_boundary = Q3 + (1.5 * IQR)

for column in col_name:
    inputDataPlay = inputDataPlay[inputDataPlay[column] >= lower_boundary[column]]
    inputDataPlay = inputDataPlay[inputDataPlay[column] <= upper_boundary[column]]

#print(df)
inputDataPlay.info(verbose=True,memory_usage='deep',show_counts=True)
inputDataPlay.describe(include='all')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   STG     403 non-null    float64
 1   SCG     403 non-null    float64
 2   STR     403 non-null    float64
 3   LPR     403 non-null    float64
 4   PEG     403 non-null    float64
 5   UNS     403 non-null    object 
dtypes: float64(5), object(1)
memory usage: 40.2 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 398 entries, 0 to 402
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   STG     398 non-null    float64
 1   SCG     398 non-null    float64
 2   STR     398 non-null    float64
 3   LPR     398 non-null    float64
 4   PEG     398 non-null    float64
 5   UNS     398 non-null    object 
dtypes: float64(5), object(1)
memory usage: 42.7 KB
Out[1153]:
STG SCG STR LPR PEG UNS
count 398.00000 398.000000 398.000000 398.000000 398.000000 398
unique NaN NaN NaN NaN NaN 4
top NaN NaN NaN NaN NaN Low
freq NaN NaN NaN NaN NaN 129
mean 0.34602 0.354332 0.459937 0.429751 0.451490 NaN
std 0.20348 0.215366 0.246517 0.257527 0.264638 NaN
min 0.00000 0.000000 0.000000 0.000000 0.000000 NaN
25% 0.20000 0.200000 0.280000 0.250000 0.250000 NaN
50% 0.30000 0.300000 0.450000 0.330000 0.375000 NaN
75% 0.46375 0.510000 0.680000 0.650000 0.660000 NaN
max 0.89000 0.900000 0.950000 0.990000 0.990000 NaN
In [1154]:
# Plot outliers in box plot for X variables
# boxplot = inputData.boxplot(column=inputData.columns['Distance', 'Cost','Discount','Delivery charges','Surge charges','Packaging charges','ST','Tip'])
dfBoxPlot=inputDataPlay.select_dtypes(include='number')
dfBoxPlot.info()
boxplot = dfBoxPlot.boxplot(column=dfBoxPlot.columns.values.tolist())#[ :-1]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 398 entries, 0 to 402
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   STG     398 non-null    float64
 1   SCG     398 non-null    float64
 2   STR     398 non-null    float64
 3   LPR     398 non-null    float64
 4   PEG     398 non-null    float64
dtypes: float64(5)
memory usage: 18.7 KB
In [1155]:
inputDataPlay.boxplot(layout=(2,3), by='UNS', figsize=[15,10])
Out[1155]:
array([[<AxesSubplot:title={'center':'LPR'}, xlabel='[UNS]'>,
        <AxesSubplot:title={'center':'PEG'}, xlabel='[UNS]'>,
        <AxesSubplot:title={'center':'SCG'}, xlabel='[UNS]'>],
       [<AxesSubplot:title={'center':'STG'}, xlabel='[UNS]'>,
        <AxesSubplot:title={'center':'STR'}, xlabel='[UNS]'>,
        <AxesSubplot:>]], dtype=object)

5.10 Underake Label Encoding

Convert category values into integer values

In [1156]:
columns = ['UNS']

for col in columns:
    inputDataPlay[col] = inputDataPlay[col].astype('category')
In [1157]:
inputDataPlay.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 398 entries, 0 to 402
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   STG     398 non-null    float64 
 1   SCG     398 non-null    float64 
 2   STR     398 non-null    float64 
 3   LPR     398 non-null    float64 
 4   PEG     398 non-null    float64 
 5   UNS     398 non-null    category
dtypes: category(1), float64(5)
memory usage: 19.2 KB
In [1158]:
inputDataPlay["UNS"] = inputDataPlay.UNS.map({"Very Low":0,"Low":1,"Middle":2,"High":3})
In [1159]:
inputDataPlay.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 398 entries, 0 to 402
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   STG     398 non-null    float64 
 1   SCG     398 non-null    float64 
 2   STR     398 non-null    float64 
 3   LPR     398 non-null    float64 
 4   PEG     398 non-null    float64 
 5   UNS     398 non-null    category
dtypes: category(1), float64(5)
memory usage: 19.2 KB
In [1160]:
inputDataPlay.head(10)
Out[1160]:
STG SCG STR LPR PEG UNS
0 0.00 0.00 0.00 0.00 0.00 0
1 0.08 0.08 0.10 0.24 0.90 3
2 0.06 0.06 0.05 0.25 0.33 1
3 0.10 0.10 0.15 0.65 0.30 2
4 0.08 0.08 0.08 0.98 0.24 1
5 0.09 0.15 0.40 0.10 0.66 2
6 0.10 0.10 0.43 0.29 0.56 2
7 0.15 0.02 0.34 0.40 0.01 0
8 0.20 0.14 0.35 0.72 0.25 1
9 0.00 0.00 0.50 0.20 0.85 3

6.0 Q-3: Perform stratified splitting of train and test data to ensure similar class distribution. 1Mark

In [1161]:
X = inputDataPlay.loc[:, ~inputDataPlay.columns.isin(['UNS'])]
X
Out[1161]:
STG SCG STR LPR PEG
0 0.00 0.00 0.00 0.00 0.00
1 0.08 0.08 0.10 0.24 0.90
2 0.06 0.06 0.05 0.25 0.33
3 0.10 0.10 0.15 0.65 0.30
4 0.08 0.08 0.08 0.98 0.24
... ... ... ... ... ...
397 0.68 0.61 0.34 0.31 0.23
399 0.85 0.82 0.66 0.83 0.83
400 0.56 0.60 0.77 0.13 0.32
401 0.66 0.68 0.81 0.57 0.57
402 0.68 0.64 0.79 0.97 0.24

398 rows × 5 columns

In [1162]:
y = inputDataPlay['UNS']
y
Out[1162]:
0      0
1      3
2      1
3      2
4      1
      ..
397    1
399    3
400    1
401    2
402    2
Name: UNS, Length: 398, dtype: category
Categories (4, int64): [3, 1, 2, 0]
In [1163]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

X_train.head(5)
Out[1163]:
STG SCG STR LPR PEG
68 0.280 0.100 0.12 0.28 0.32
280 0.110 0.260 0.56 0.68 0.27
158 0.465 0.258 0.73 0.18 0.59
115 0.285 0.640 0.18 0.61 0.45
292 0.140 0.380 0.59 0.11 0.32
In [1164]:
#create train and test dataframes for ChefBoost implementation using Gain Ratio
dfTrain = pd.concat([X_train, y_train], axis=1)
dfTrain['UNS'] = dfTrain.UNS.astype(str)
dfTrain.head(5)
Out[1164]:
STG SCG STR LPR PEG UNS
68 0.280 0.100 0.12 0.28 0.32 1
280 0.110 0.260 0.56 0.68 0.27 1
158 0.465 0.258 0.73 0.18 0.59 2
115 0.285 0.640 0.18 0.61 0.45 2
292 0.140 0.380 0.59 0.11 0.32 1
In [1165]:
dfTrain.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 278 entries, 68 to 332
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   STG     278 non-null    float64
 1   SCG     278 non-null    float64
 2   STR     278 non-null    float64
 3   LPR     278 non-null    float64
 4   PEG     278 non-null    float64
 5   UNS     278 non-null    object 
dtypes: float64(5), object(1)
memory usage: 15.2+ KB
In [1166]:
dfTest = pd.concat([X_test, y_test], axis=1)
dfTest['UNS'] = dfTest.UNS.astype(str)
dfTest.head(5)
Out[1166]:
STG SCG STR LPR PEG UNS
335 0.32 0.20 0.84 0.81 0.80 3
100 0.27 0.28 0.18 0.48 0.26 1
249 0.72 0.60 0.45 0.79 0.45 2
108 0.32 0.27 0.52 0.81 0.30 2
319 0.29 0.10 0.17 0.74 0.52 2
In [1167]:
dfTest.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 335 to 288
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   STG     120 non-null    float64
 1   SCG     120 non-null    float64
 2   STR     120 non-null    float64
 3   LPR     120 non-null    float64
 4   PEG     120 non-null    float64
 5   UNS     120 non-null    object 
dtypes: float64(5), object(1)
memory usage: 6.6+ KB

6.1 Apply SMOTE to enable class balance

In [1168]:
y_train.value_counts()
Out[1168]:
1    90
2    85
3    68
0    35
Name: UNS, dtype: int64
In [1169]:
smot = RandomOverSampler(random_state=42)
X_train_smote,y_train_smote = smot.fit_resample(X_train,y_train)
y_train_smote.value_counts()
Out[1169]:
3    90
1    90
2    90
0    90
Name: UNS, dtype: int64
In [1170]:
#create train and test dataframes for ChefBoost implementation using Gain Ratio
dfTrain_smote = pd.concat([X_train_smote, y_train_smote], axis=1)
dfTrain_smote['UNS'] = dfTrain_smote.UNS.astype(str)
dfTrain_smote.head(5)
Out[1170]:
STG SCG STR LPR PEG UNS
0 0.280 0.100 0.12 0.28 0.32 1
1 0.110 0.260 0.56 0.68 0.27 1
2 0.465 0.258 0.73 0.18 0.59 2
3 0.285 0.640 0.18 0.61 0.45 2
4 0.140 0.380 0.59 0.11 0.32 1
In [1171]:
dfTrain_smote.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   STG     360 non-null    float64
 1   SCG     360 non-null    float64
 2   STR     360 non-null    float64
 3   LPR     360 non-null    float64
 4   PEG     360 non-null    float64
 5   UNS     360 non-null    object 
dtypes: float64(5), object(1)
memory usage: 17.0+ KB

7.0 Q-4: Build a classifier model using gini index and Decision Tree algorithm and plot the same. - (1+1)Marks

7.1 Define the Functions for Creating Decision Tree Models

In [1172]:
labels=['STG', 'SCG','STR','LPR','PEG']
classes=['Very Low','Low','Middle','High']
In [1173]:
### Create Array to store results
# create a Feature_Subset_Result array
dFFeature_Subset_Result = pd.DataFrame(columns=['Best Estimator', 'Model', 
                                                'Training Time', 'Testing Time', 
                                                'Training Set Accuracy', 'Testing Set Accuracy',
                                                'Training Set Confusion Matrix', 'Testing Set Confusion Matrix',
                                                'Classifiction Report', 'Parameters of best estimator',
                                                'Avg. Cross Validation Score of Best Estimator',
                                                'Total number of cross validation sets','FPR','TPR','P_FPR','P_TPR','ccp_alphas',
                                                'X_train','y_train','X_test','y_test'])
Feature_Subset_Result = []

Function to plot Confusion Matrix

In [1174]:
plt.rcParams["font.family"] = 'DejaVu Sans'

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# helper function
def plot_confusionmatrix(y_train_pred,y_train,dom):
    print(f'{dom} Confusion matrix')
    cf = confusion_matrix(y_train_pred,y_train)
    sns.heatmap(cf,annot=True,yticklabels=classes
               ,xticklabels=classes,cmap='Blues', fmt='g')
    plt.tight_layout()
    plt.show()

Generic function to run any model specified

In [1175]:
from datetime import datetime
def perform_model(model, X_train, y_train, X_test, y_test, class_labels, cm_normalize=True, \
                 print_cm=True, cm_cmap=plt.cm.Greens):
    
    # to store results at various phases
    results = dict()
    
    results['X_train'] = X_train
    results['y_train'] = y_train
    results['X_test'] = X_test
    results['y_test'] = y_test
    
    # time at which model starts training 
    train_start_time = datetime.now()
    print('training the model..')
    model.fit(X_train, y_train)
    print('Done \n \n')
    train_end_time = datetime.now()
    results['training_time'] =  train_end_time - train_start_time
    print('---------------------')
    print('|   Training Time    |')
    print('---------------------')
    print('training_time(HH:MM:SS.ms) - {}\n\n'.format(results['training_time']))
    results['model_name_'] = str(model)
    #print(str(model))
    
    
    # predict test data
    print('Predicting test data')
    test_start_time = datetime.now()
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    test_end_time = datetime.now()
    print('Done \n \n')
    results['testing_time'] = test_end_time - test_start_time
    print('---------------------')
    print('|   Testing Time    |')
    print('---------------------')
    print('testing time(HH:MM:SS:ms) - {}\n\n'.format(results['testing_time']))
    results['predicted'] = y_pred
   

    # calculate overall training accuracy of the model
    train_accuracy = metrics.accuracy_score(y_true=y_train, y_pred=y_pred_train)
    # store accuracy in results
    results['TrainingAccuracy'] = train_accuracy
    print('---------------------')
    print('| Training Set Accuracy |')
    print('---------------------')
    print('\n    {}\n\n'.format(train_accuracy))

    # calculate overall testing accuracy of the model
    test_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    # store accuracy in results
    results['TestingAccuracy'] = test_accuracy
    print('---------------------')
    print('| Testing Set Accuracy  |')
    print('---------------------')
    print('\n    {}\n\n'.format(test_accuracy))
          
    # Training set confusion matrix
    cm_train = metrics.confusion_matrix(y_train, y_pred_train)
    results['confusion_matrix_train'] = cm_train
    if print_cm: 
        print('-----------------------------')
        print('| Train Set Confusion Matrix |')
        print('-----------------------------')
        print('\n {}'.format(cm_train))
        
    # Test set confusion matrix
    cm_test = metrics.confusion_matrix(y_test, y_pred)
    results['confusion_matrix_test'] = cm_test
    if print_cm: 
        print('\n-----------------------------')
        print('| Test Set Confusion Matrix |')
        print('-----------------------------')
        print('\n {}'.format(cm_test))
        
    # plot train-set confusion matrix
    plt.figure(figsize=(8,8))
    plt.grid(b=False)
    #print(class_labels)
    plot_confusion_matrix(cm_train, class_labels, normalize=True, title='Train Set Normalized confusion matrix', cmap = cm_cmap)
    plt.show()
        
    # plot test-set confusion matrix
    plt.figure(figsize=(8,8))
    plt.grid(b=False)
    #print(class_labels)
    plot_confusion_matrix(cm_test, class_labels, normalize=True, title='Test Set Normalized confusion matrix', cmap = cm_cmap)
    plt.show()
    
    # get classification report
    print('-------------------------')
    print('| Classification Report |')
    print('-------------------------')
    classification_report = metrics.classification_report(y_test, y_pred)
    # store report in results
    results['classification_report'] = classification_report
    print(classification_report)
    
    # get ROC Score and Curve
    print('-------------------------')
    print('| ROC Curve |')
    print('-------------------------')
    # predict probabilities
    pred_prob = model.predict_proba(X_test)
    # roc curve for models
    fpr, tpr, thresh = roc_curve(y_test, pred_prob[:,1], pos_label=1)
    results['fpr'] = fpr
    results['tpr'] = tpr
    # roc curve for tpr = fpr (blue line with 50% area)
    random_probs = [0 for i in range(len(y_test))]
    p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)
    results['p_fpr'] = p_fpr
    results['p_tpr'] = p_tpr
    # auc scores
    # auc_score = roc_auc_score(y_test, pred_prob[:,1])
    # plot roc curves
    plt.plot(fpr, tpr, linestyle='--',color='orange', label="")
    plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
    # title
    plt.title('ROC curve')
    # x label
    plt.xlabel('False Positive Rate')
    # y label
    plt.ylabel('True Positive rate')
    plt.legend(loc='best')
    plt.savefig('ROC',dpi=300)
    plt.rcParams['figure.figsize'] = [10, 6]
    plt.show();
    
    # add the trained  model to the results
    results['model'] = model
    
    # Plot Decision Tree via. Plot_tree
    print('------------------------------------')
    print('|   Decision Tree via. Plot_tree   |')
    print('------------------------------------')
    final_model = model.best_estimator_
    final_model_fit = final_model.fit(X_test, y_test)
    #final_model.fit(X_train, y_train)
    #y_predict = final_model.predict(X_test)
    _, ax = plt.subplots(figsize=(30,30)) # Resize figure
    #fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
    tree.plot_tree(final_model_fit, filled=True, feature_names = labels, ax=ax)
    plt.title("Decision trees")
    #plt.figure(figsize=(40,40))
    plt.show()

    # Plot Decision Tree via. graphviz
    print('------------------------------------')
    print('|   Decision Tree via. graphviz   |')
    print('------------------------------------')
    #dot_data = export_graphviz(model.best_estimator_, out_file=None, filled=True, rounded=True, feature_names=labels) #, class_names=['0','1','2']
    dot_data = tree.export_graphviz(final_model_fit, feature_names = labels, filled = True, rounded=True, special_characters=True, out_file=None)
    graph = graphviz.Source(dot_data)  
    display(graph)
    
    print('--------------------------------------')
    print('|      Decision Tree Description     |')
    print('--------------------------------------')
    n_nodes = final_model_fit.tree_.node_count
    children_left = final_model_fit.tree_.children_left
    children_right = final_model_fit.tree_.children_right
    feature = final_model_fit.tree_.feature
    threshold = final_model_fit.tree_.threshold
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
    while len(stack) > 0:
        # `pop` ensures each node is only visited once
        node_id, depth = stack.pop()
        node_depth[node_id] = depth

        # If the left and right child of a node is not the same we have a split
        # node
        is_split_node = children_left[node_id] != children_right[node_id]
        # If a split node, append left and right children and depth to `stack`
        # so we can loop through them
        if is_split_node:
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            is_leaves[node_id] = True

    print(
        "The binary tree structure has {n} nodes and has "
        "the following tree structure:\n".format(n=n_nodes)
    )
    for i in range(n_nodes):
        if is_leaves[i]:
            print(
                "{space}node={node} is a leaf node.".format(
                    space=node_depth[i] * "\t", node=i
                )
            )
        else:
            print(
                "{space}node={node} is a split node: "
                "go to node {left} if X[:, {feature}] <= {threshold} "
                "else to node {right}.".format(
                    space=node_depth[i] * "\t",
                    node=i,
                    left=children_left[i],
                    feature=feature[i],
                    threshold=threshold[i],
                    right=children_right[i],
                )
            )
    
    path = final_model.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities
    print('\n--------------------------------------')
    print('|     ccp_alphas for Post Pruning     |')
    print('--------------------------------------\n')
    print(ccp_alphas)
    results['ccp_alphas'] = ccp_alphas
    
    return results

Method to print the gridsearch Attributes

In [1176]:
def print_grid_search_attributes(log_reg_grid_results):
    model = log_reg_grid_results['model']
    
    # Estimator that gave highest score among all the estimators formed in GridSearch
    print('--------------------------')
    print('|      Best Estimator     |')
    print('--------------------------')
    print('\n\t{}\n'.format(model.best_estimator_))


    # parameters that gave best results while performing grid search
    print('--------------------------')
    print('|     Best parameters     |')
    print('--------------------------')
    print('\tParameters of best estimator : \n\n\t{}\n'.format(model.best_params_))


    #  number of cross validation splits
    print('---------------------------------')
    print('|   No of CrossValidation sets   |')
    print('--------------------------------')
    print('\n\tTotal number of cross validation sets: {}\n'.format(model.n_splits_))


    # Average cross validated score of the best estimator, from the Grid Search 
    print('--------------------------')
    print('|        Best Score       |')
    print('--------------------------')
    print('\n\tAverage Cross Validate scores of best estimator : \n\n\t{}\n'.format(model.best_score_))
    
    Feature_Subset_Result.append((model.best_estimator_, str(model), log_reg_grid_results['training_time'],  log_reg_grid_results['testing_time'],
                                  log_reg_grid_results['TrainingAccuracy'], log_reg_grid_results['TestingAccuracy'],
                                  log_reg_grid_results['confusion_matrix_train'], log_reg_grid_results['confusion_matrix_test'],
                                  log_reg_grid_results['classification_report'],
                                  model.best_params_, model.best_score_, model.n_splits_, log_reg_grid_results['fpr'], log_reg_grid_results['tpr'],
                                  log_reg_grid_results['p_fpr'], log_reg_grid_results['p_tpr'],
                                  log_reg_grid_results['ccp_alphas'],
                                  log_reg_grid_results['X_train'],log_reg_grid_results['y_train'],log_reg_grid_results['X_test'],log_reg_grid_results['y_test']))

7.2 Calculate the best Decision Tree Model using GridSearchCV using Gini Criterion with Original Data

In [1177]:
# start Grid search with Gini Criterion
parameters = {'criterion': ['gini'], 
              'splitter': ['best', 'random'], 
              'max_depth':[None],
              'min_samples_split':[2],
              'min_samples_leaf':[1],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]}
des_tree_gini = DecisionTreeClassifier()
des_tree_gini_grid = GridSearchCV(des_tree_gini, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_gini_grid_results = perform_model(des_tree_gini_grid, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_gini_grid_results)
training the model..
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:08.519116


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.006995


---------------------
| Training Set Accuracy |
---------------------

    1.0


---------------------
| Testing Set Accuracy  |
---------------------

    0.8833333333333333


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[35  0  0  0]
 [ 0 90  0  0]
 [ 0  0 85  0]
 [ 0  0  0 68]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 35  2  0]
 [ 0  5 31  1]
 [ 0  0  1 28]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.81      0.90      0.85        39
           2       0.91      0.84      0.87        37
           3       0.97      0.97      0.97        29

    accuracy                           0.88       120
   macro avg       0.89      0.88      0.88       120
weighted avg       0.89      0.88      0.88       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.69 gini = 0.75 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.335 gini = 0.687 samples = 94 value = [30.0, 30.0, 30.0, 3.103] 0->1 True 46 gini = -0.0 samples = 26 value = [0.0, 0.0, 0.0, 26.897] 0->46 False 2 LPR ≤ 0.265 gini = 0.567 samples = 59 value = [30.0, 29.231, 4.865, 0.0] 1->2 37 LPR ≤ 0.855 gini = 0.237 samples = 35 value = [0.0, 0.769, 25.135, 3.103] 1->37 3 PEG ≤ 0.27 gini = 0.316 samples = 18 value = [22.0, 5.385, 0.0, 0.0] 2->3 12 PEG ≤ 0.13 gini = 0.513 samples = 41 value = [8.0, 23.846, 4.865, 0.0] 2->12 4 PEG ≤ 0.22 gini = 0.065 samples = 12 value = [22.0, 0.769, 0.0, 0.0] 3->4 9 PEG ≤ 0.3 gini = 0.0 samples = 6 value = [0.0, 4.615, 0.0, 0.0] 3->9 5 gini = 0.0 samples = 9 value = [18, 0, 0, 0] 4->5 6 STG ≤ 0.45 gini = 0.271 samples = 3 value = [4.0, 0.769, 0.0, 0.0] 4->6 7 gini = 0.0 samples = 2 value = [4, 0, 0, 0] 6->7 8 gini = -0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 6->8 10 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 9->10 11 gini = 0.0 samples = 5 value = [0.0, 3.846, 0.0, 0.0] 9->11 13 LPR ≤ 0.665 gini = 0.271 samples = 6 value = [8.0, 1.538, 0.0, 0.0] 12->13 18 LPR ≤ 0.79 gini = 0.294 samples = 35 value = [0.0, 22.308, 4.865, 0.0] 12->18 14 gini = 0.0 samples = 4 value = [8, 0, 0, 0] 13->14 15 STG ≤ 0.23 gini = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 13->15 16 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 15->16 17 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 15->17 19 STR ≤ 0.805 gini = 0.182 samples = 31 value = [0.0, 21.538, 2.432, 0.0] 18->19 34 STR ≤ 0.115 gini = 0.365 samples = 4 value = [0.0, 0.769, 2.432, 0.0] 18->34 20 LPR ≤ 0.64 gini = 0.134 samples = 29 value = [0.0, 20.769, 1.622, 0.0] 19->20 31 PEG ≤ 0.225 gini = 0.5 samples = 2 value = [0.0, 0.769, 0.811, 0.0] 19->31 21 SCG ≤ 0.055 gini = 0.0 samples = 22 value = [0.0, 16.923, 0.0, 0.0] 20->21 24 LPR ≤ 0.705 gini = 0.417 samples = 7 value = [0.0, 3.846, 1.622, 0.0] 20->24 22 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 21->22 23 gini = 0.0 samples = 21 value = [0.0, 16.154, 0.0, 0.0] 21->23 25 SCG ≤ 0.33 gini = 0.436 samples = 3 value = [0.0, 0.769, 1.622, 0.0] 24->25 30 gini = -0.0 samples = 4 value = [0.0, 3.077, 0.0, 0.0] 24->30 26 LPR ≤ 0.67 gini = 0.0 samples = 2 value = [0.0, 0.0, 1.622, 0.0] 25->26 29 gini = -0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 25->29 27 gini = 0.0 samples = 1 value = [0.0, 0.0, 0.811, 0.0] 26->27 28 gini = 0.0 samples = 1 value = [0.0, 0.0, 0.811, 0.0] 26->28 32 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 31->32 33 gini = 0.0 samples = 1 value = [0.0, 0.0, 0.811, 0.0] 31->33 35 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 34->35 36 gini = 0.0 samples = 3 value = [0.0, 0.0, 2.432, 0.0] 34->36 38 STG ≤ 0.135 gini = 0.058 samples = 32 value = [0.0, 0.769, 25.135, 0.0] 37->38 43 STR ≤ 0.555 gini = 0.0 samples = 3 value = [0.0, 0.0, 0.0, 3.103] 37->43 39 SCG ≤ 0.35 gini = 0.365 samples = 4 value = [0.0, 0.769, 2.432, 0.0] 38->39 42 gini = 0.0 samples = 28 value = [0.0, 0.0, 22.703, 0.0] 38->42 40 gini = 0.0 samples = 3 value = [0.0, 0.0, 2.432, 0.0] 39->40 41 gini = -0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 39->41 44 gini = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 43->44 45 gini = 0.0 samples = 2 value = [0.0, 0.0, 0.0, 2.069] 43->45
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 47 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.6899999976158142 else to node 46.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.33500000834465027 else to node 37.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.26500000059604645 else to node 12.
			node=3 is a split node: go to node 4 if X[:, 4] <= 0.26999999582767487 else to node 9.
				node=4 is a split node: go to node 5 if X[:, 4] <= 0.2199999988079071 else to node 6.
					node=5 is a leaf node.
					node=6 is a split node: go to node 7 if X[:, 0] <= 0.44999998807907104 else to node 8.
						node=7 is a leaf node.
						node=8 is a leaf node.
				node=9 is a split node: go to node 10 if X[:, 4] <= 0.29999999701976776 else to node 11.
					node=10 is a leaf node.
					node=11 is a leaf node.
			node=12 is a split node: go to node 13 if X[:, 4] <= 0.12999999895691872 else to node 18.
				node=13 is a split node: go to node 14 if X[:, 3] <= 0.6649999916553497 else to node 15.
					node=14 is a leaf node.
					node=15 is a split node: go to node 16 if X[:, 0] <= 0.23000000417232513 else to node 17.
						node=16 is a leaf node.
						node=17 is a leaf node.
				node=18 is a split node: go to node 19 if X[:, 3] <= 0.7899999916553497 else to node 34.
					node=19 is a split node: go to node 20 if X[:, 2] <= 0.8050000071525574 else to node 31.
						node=20 is a split node: go to node 21 if X[:, 3] <= 0.6399999856948853 else to node 24.
							node=21 is a split node: go to node 22 if X[:, 1] <= 0.054999999701976776 else to node 23.
								node=22 is a leaf node.
								node=23 is a leaf node.
							node=24 is a split node: go to node 25 if X[:, 3] <= 0.7050000131130219 else to node 30.
								node=25 is a split node: go to node 26 if X[:, 1] <= 0.32999999821186066 else to node 29.
									node=26 is a split node: go to node 27 if X[:, 3] <= 0.6699999868869781 else to node 28.
										node=27 is a leaf node.
										node=28 is a leaf node.
									node=29 is a leaf node.
								node=30 is a leaf node.
						node=31 is a split node: go to node 32 if X[:, 4] <= 0.22499999403953552 else to node 33.
							node=32 is a leaf node.
							node=33 is a leaf node.
					node=34 is a split node: go to node 35 if X[:, 2] <= 0.11500000208616257 else to node 36.
						node=35 is a leaf node.
						node=36 is a leaf node.
		node=37 is a split node: go to node 38 if X[:, 3] <= 0.8549999892711639 else to node 43.
			node=38 is a split node: go to node 39 if X[:, 0] <= 0.13499999791383743 else to node 42.
				node=39 is a split node: go to node 40 if X[:, 1] <= 0.3500000089406967 else to node 41.
					node=40 is a leaf node.
					node=41 is a leaf node.
				node=42 is a leaf node.
			node=43 is a split node: go to node 44 if X[:, 2] <= 0.5550000071525574 else to node 45.
				node=44 is a leaf node.
				node=45 is a leaf node.
	node=46 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.00000000e+00 1.95921710e-18 2.61228947e-18 3.70074342e-18
 1.24083750e-17 2.11267606e-03 2.42718447e-03 2.51805559e-03
 2.80000000e-03 2.86944046e-03 2.87356322e-03 3.68307155e-03
 4.13223140e-03 4.60139305e-03 4.65116279e-03 4.71338240e-03
 5.19031142e-03 6.13872706e-03 7.54046548e-03 8.06476542e-03
 1.05202714e-02 1.48178638e-02 3.48125409e-02 9.50869721e-02
 1.69846939e-01 2.10651925e-01]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(class_weight='balanced', max_features='auto',
                       random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.906459404706249

7.3 Calculate the best Decision Tree Model using GridSearchCV using Gini Criterion with Balaced Data using SMOTE

In [1178]:
# start Grid search with Gini Criterion
parameters = {'criterion': ['gini'], 
              'splitter': ['best', 'random'], 
              'max_depth':[None],
              'min_samples_split':[2],
              'min_samples_leaf':[1],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]}
des_tree_gini = DecisionTreeClassifier()
des_tree_gini_grid = GridSearchCV(des_tree_gini, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_gini_grid_results = perform_model(des_tree_gini_grid, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_gini_grid_results)
training the model..
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:00.327816


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.006993


---------------------
| Training Set Accuracy |
---------------------

    1.0


---------------------
| Testing Set Accuracy  |
---------------------

    0.8916666666666667


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[90  0  0  0]
 [ 0 90  0  0]
 [ 0  0 90  0]
 [ 0  0  0 90]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 36  1  0]
 [ 0  3 32  2]
 [ 0  0  2 27]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.86      0.92      0.89        39
           2       0.91      0.86      0.89        37
           3       0.93      0.93      0.93        29

    accuracy                           0.89       120
   macro avg       0.89      0.88      0.88       120
weighted avg       0.89      0.89      0.89       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.69 gini = 0.725 samples = 120 value = [15, 39, 37, 29] 1 PEG ≤ 0.415 gini = 0.646 samples = 94 value = [15, 39, 37, 3] 0->1 True 40 gini = 0.0 samples = 26 value = [0, 0, 0, 26] 0->40 False 2 LPR ≤ 0.105 gini = 0.518 samples = 61 value = [15, 39, 7, 0] 1->2 37 LPR ≤ 0.835 gini = 0.165 samples = 33 value = [0, 0, 30, 3] 1->37 3 PEG ≤ 0.29 gini = 0.219 samples = 8 value = [7, 1, 0, 0] 2->3 6 PEG ≤ 0.13 gini = 0.446 samples = 53 value = [8, 38, 7, 0] 2->6 4 gini = 0.0 samples = 7 value = [7, 0, 0, 0] 3->4 5 gini = 0.0 samples = 1 value = [0, 1, 0, 0] 3->5 7 STG ≤ 0.41 gini = 0.375 samples = 8 value = [6, 2, 0, 0] 6->7 16 STG ≤ 0.72 gini = 0.334 samples = 45 value = [2, 36, 7, 0] 6->16 8 PEG ≤ 0.06 gini = 0.48 samples = 5 value = [3, 2, 0, 0] 7->8 15 gini = 0.0 samples = 3 value = [3, 0, 0, 0] 7->15 9 gini = 0.0 samples = 1 value = [0, 1, 0, 0] 8->9 10 PEG ≤ 0.09 gini = 0.375 samples = 4 value = [3, 1, 0, 0] 8->10 11 gini = 0.0 samples = 2 value = [2, 0, 0, 0] 10->11 12 LPR ≤ 0.635 gini = 0.5 samples = 2 value = [1, 1, 0, 0] 10->12 13 gini = 0.0 samples = 1 value = [1, 0, 0, 0] 12->13 14 gini = 0.0 samples = 1 value = [0, 1, 0, 0] 12->14 17 LPR ≤ 0.79 gini = 0.31 samples = 44 value = [2, 36, 6, 0] 16->17 36 gini = 0.0 samples = 1 value = [0, 0, 1, 0] 16->36 18 STR ≤ 0.575 gini = 0.226 samples = 40 value = [2, 35, 3, 0] 17->18 33 STR ≤ 0.115 gini = 0.375 samples = 4 value = [0, 1, 3, 0] 17->33 19 SCG ≤ 0.23 gini = 0.135 samples = 28 value = [1, 26, 1, 0] 18->19 26 SCG ≤ 0.125 gini = 0.403 samples = 12 value = [1, 9, 2, 0] 18->26 20 SCG ≤ 0.15 gini = 0.5 samples = 6 value = [1, 4, 1, 0] 19->20 25 gini = 0.0 samples = 22 value = [0, 22, 0, 0] 19->25 21 LPR ≤ 0.625 gini = 0.32 samples = 5 value = [0, 4, 1, 0] 20->21 24 gini = 0.0 samples = 1 value = [1, 0, 0, 0] 20->24 22 gini = 0.0 samples = 4 value = [0, 4, 0, 0] 21->22 23 gini = 0.0 samples = 1 value = [0, 0, 1, 0] 21->23 27 gini = 0.0 samples = 1 value = [1, 0, 0, 0] 26->27 28 LPR ≤ 0.465 gini = 0.298 samples = 11 value = [0, 9, 2, 0] 26->28 29 gini = 0.0 samples = 6 value = [0, 6, 0, 0] 28->29 30 PEG ≤ 0.235 gini = 0.48 samples = 5 value = [0, 3, 2, 0] 28->30 31 gini = 0.0 samples = 3 value = [0, 3, 0, 0] 30->31 32 gini = 0.0 samples = 2 value = [0, 0, 2, 0] 30->32 34 gini = 0.0 samples = 1 value = [0, 1, 0, 0] 33->34 35 gini = 0.0 samples = 3 value = [0, 0, 3, 0] 33->35 38 gini = 0.0 samples = 30 value = [0, 0, 30, 0] 37->38 39 gini = 0.0 samples = 3 value = [0, 0, 0, 3] 37->39
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 41 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.6899999976158142 else to node 40.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.41499999165534973 else to node 37.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.10500000044703484 else to node 6.
			node=3 is a split node: go to node 4 if X[:, 4] <= 0.2900000065565109 else to node 5.
				node=4 is a leaf node.
				node=5 is a leaf node.
			node=6 is a split node: go to node 7 if X[:, 4] <= 0.12999999895691872 else to node 16.
				node=7 is a split node: go to node 8 if X[:, 0] <= 0.4099999964237213 else to node 15.
					node=8 is a split node: go to node 9 if X[:, 4] <= 0.06000000052154064 else to node 10.
						node=9 is a leaf node.
						node=10 is a split node: go to node 11 if X[:, 4] <= 0.08999999985098839 else to node 12.
							node=11 is a leaf node.
							node=12 is a split node: go to node 13 if X[:, 3] <= 0.6349999904632568 else to node 14.
								node=13 is a leaf node.
								node=14 is a leaf node.
					node=15 is a leaf node.
				node=16 is a split node: go to node 17 if X[:, 0] <= 0.7199999988079071 else to node 36.
					node=17 is a split node: go to node 18 if X[:, 3] <= 0.7899999916553497 else to node 33.
						node=18 is a split node: go to node 19 if X[:, 2] <= 0.574999988079071 else to node 26.
							node=19 is a split node: go to node 20 if X[:, 1] <= 0.22999999672174454 else to node 25.
								node=20 is a split node: go to node 21 if X[:, 1] <= 0.15000000223517418 else to node 24.
									node=21 is a split node: go to node 22 if X[:, 3] <= 0.625 else to node 23.
										node=22 is a leaf node.
										node=23 is a leaf node.
									node=24 is a leaf node.
								node=25 is a leaf node.
							node=26 is a split node: go to node 27 if X[:, 1] <= 0.1250000037252903 else to node 28.
								node=27 is a leaf node.
								node=28 is a split node: go to node 29 if X[:, 3] <= 0.4650000035762787 else to node 30.
									node=29 is a leaf node.
									node=30 is a split node: go to node 31 if X[:, 4] <= 0.23499999195337296 else to node 32.
										node=31 is a leaf node.
										node=32 is a leaf node.
						node=33 is a split node: go to node 34 if X[:, 2] <= 0.11500000208616257 else to node 35.
							node=34 is a leaf node.
							node=35 is a leaf node.
					node=36 is a leaf node.
		node=37 is a split node: go to node 38 if X[:, 3] <= 0.8350000083446503 else to node 39.
			node=38 is a leaf node.
			node=39 is a leaf node.
	node=40 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.         0.00231481 0.00262346 0.00267857 0.00330688 0.0037037
 0.00396825 0.00416667 0.00457875 0.00458333 0.00462963 0.00493827
 0.00666667 0.00709085 0.00716963 0.00721501 0.00865973 0.00942761
 0.01082506 0.01446759 0.0144977  0.02105404 0.03338142 0.03508376
 0.16355609 0.21043512]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(max_features='auto', random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9277777777777777

8.0 Q-5: Build a classifier model using gain ratio and Decision Tree algorithm and plot the same - (1+1) Marks

8.1 Calculate the best Decision Tree Model using Chef Boost using C4.5 Criterion that works on Information Gain Ratio with Original Data

The following regular decision tree algorithms are wrapped in the ChefBoost library.

image.png

So we will use the C4.5 method of ChefBoost to use Gain Ratio and create our model

In [1179]:
# to store results at various phases
ChefBoostresults = dict()

ChefBoostresults['X_train'] = X_train
ChefBoostresults['y_train'] = y_train
ChefBoostresults['X_test'] = X_test
ChefBoostresults['y_test'] = y_test

#Set algorithm to ID3, C4.5, CART, CHAID or Regression
config = {'algorithm': 'C4.5', 'enableParallelism': [False], 'num_cores': [1]}
# time at which model starts training 
train_start_time = datetime.now()
print('training the model..')
model = chef.fit(dfTrain.copy(deep=True), config, target_label = 'UNS')
print('Done \n \n')
train_end_time = datetime.now()
ChefBoostresults['training_time'] =  train_end_time - train_start_time
print('---------------------')
print('|   Training Time    |')
print('---------------------')
print('training_time(HH:MM:SS.ms) - {}\n\n'.format(ChefBoostresults['training_time']))
training the model..
C4.5  tree is going to be built...
-------------------------
finished in  15.114295959472656  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  70.86330935251799 % on  278  instances
Labels:  ['1' '2' '0' '3']
Confusion matrix:  [[90, 15, 32, 0], [0, 69, 0, 33], [0, 0, 3, 0], [0, 1, 0, 35]]
Decision  1  => Accuray:  83.0935 %, Precision:  65.6934 %, Recall:  100.0 %, F1:  79.2951 %
Decision  2  => Accuray:  82.3741 %, Precision:  67.6471 %, Recall:  81.1765 %, F1:  73.7968 %
Decision  0  => Accuray:  88.4892 %, Precision:  100.0 %, Recall:  8.5714 %, F1:  15.7894 %
Decision  3  => Accuray:  87.7698 %, Precision:  97.2222 %, Recall:  51.4706 %, F1:  67.3077 %
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:15.182259


Check Prediction for the first column

In [1180]:
dfTrain.iloc[0]
Out[1180]:
STG    0.28
SCG     0.1
STR    0.12
LPR    0.28
PEG    0.32
UNS       1
Name: 68, dtype: object
In [1181]:
prediction = chef.predict(model, dfTrain.iloc[0])
In [1182]:
prediction
Out[1182]:
'1'

Check Prediction for enire data set

In [1183]:
for index, instance in dfTrain.iterrows():
    prediction = chef.predict(model, instance)
    actual = instance['UNS']
    
    if actual == prediction:
        classified = True
    else:
        classified = False
        print("*",end='')
        #Mark predictions which are not right, with a * mark at the start
    
    print ("Actual: ", actual, "--Prediction: ", prediction, "--Error: ", int(prediction) - int(actual))
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  0 --Prediction:  0 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  2 --Prediction:  3 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  0 --Prediction:  0 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  0 --Prediction:  0 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0

Check Feature Importance

In [1184]:
rules = "outputs/rules/rules.py"
fi = chef.feature_importance(rules).set_index("feature")
fi.plot(kind="barh", title="Feature Importance");
print(fi)
Decision rule:  outputs/rules/rules.py
         importance
feature            
STG          0.6393
SCG          0.3225
LPR          0.3043
STR         -0.0025
PEG         -0.2636

This shows that the STG Feature is the most important feature, post which we have the LPR and SCG feature, after which is the PEG Feature.

Evaluate the model using dfTest

In [1185]:
# predict test data
print('Predicting test data')
test_start_time = datetime.now()
chef.evaluate(model, dfTest, task="test", target_label = 'UNS')
test_end_time = datetime.now()
print('Done \n \n')
ChefBoostresults['testing_time'] = test_end_time - test_start_time
print('---------------------')
print('|   Testing Time    |')
print('---------------------')
print('testing time(HH:MM:SS:ms) - {}\n\n'.format(ChefBoostresults['testing_time']))
Predicting test data
-------------------------
Evaluate  test set
-------------------------
Accuracy:  65.0 % on  120  instances
Labels:  ['3' '1' '2' '0']
Confusion matrix:  [[9, 0, 1, 0], [0, 38, 7, 13], [20, 1, 29, 0], [0, 0, 0, 2]]
Decision  3  => Accuray:  82.5 %, Precision:  90.0 %, Recall:  31.0345 %, F1:  46.1539 %
Decision  1  => Accuray:  82.5 %, Precision:  65.5172 %, Recall:  97.4359 %, F1:  78.3505 %
Decision  2  => Accuray:  75.8333 %, Precision:  58.0 %, Recall:  78.3784 %, F1:  66.6667 %
Decision  0  => Accuray:  89.1667 %, Precision:  100.0 %, Recall:  13.3333 %, F1:  23.5294 %
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.094945


In [1186]:
Feature_Subset_Result.append(("chef.fit", "chef.fit(dfTrain.copy(deep=True), config = {'algorithm': 'C4.5', 'enableParallelism': [False], 'num_cores': [1]}, target_label = 'UNS')", 
                              ChefBoostresults['training_time'],  ChefBoostresults['testing_time'],
                              "0.7086", "0.650",
                              "[[90, 15, 32, 0], [0, 69, 0, 33], [0, 0, 3, 0], [0, 1, 0, 35]]", "[[9, 0, 1, 0], [0, 38, 7, 13], [20, 1, 29, 0], [0, 0, 0, 2]]",
                              "-",
                              "", "", "", "", "",
                              "", "",
                              "",
                              ChefBoostresults['X_train'],ChefBoostresults['y_train'],ChefBoostresults['X_test'],ChefBoostresults['y_test']))

8.2 Calculate the best Decision Tree Model using Chef Boost using C4.5 Criterion that works on Information Gain Ratio with Balanced Data using SMOTE

So we will use the C4.5 method of ChefBoost to use Gain Ratio and create our model

In [1187]:
# to store results at various phases
ChefBoostresults = dict()

ChefBoostresults['X_train'] = X_train_smote
ChefBoostresults['y_train'] = y_train_smote
ChefBoostresults['X_test'] = X_test
ChefBoostresults['y_test'] = y_test

#Set algorithm to ID3, C4.5, CART, CHAID or Regression
config = {'algorithm': 'C4.5', 'enableParallelism': [False], 'num_cores': [1]}
# time at which model starts training 
train_start_time = datetime.now()
print('training the model..')
model = chef.fit(dfTrain_smote.copy(deep=True), config, target_label = 'UNS')
print('Done \n \n')
train_end_time = datetime.now()
ChefBoostresults['training_time'] =  train_end_time - train_start_time
print('---------------------')
print('|   Training Time    |')
print('---------------------')
print('training_time(HH:MM:SS.ms) - {}\n\n'.format(ChefBoostresults['training_time']))
training the model..
C4.5  tree is going to be built...
-------------------------
finished in  14.721278429031372  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  60.55555555555556 % on  360  instances
Labels:  ['1' '2' '0' '3']
Confusion matrix:  [[90, 17, 85, 0], [0, 72, 0, 39], [0, 0, 5, 0], [0, 1, 0, 51]]
Decision  1  => Accuray:  71.6667 %, Precision:  46.875 %, Recall:  100.0 %, F1:  63.8298 %
Decision  2  => Accuray:  84.1667 %, Precision:  64.8649 %, Recall:  80.0 %, F1:  71.6418 %
Decision  0  => Accuray:  76.3889 %, Precision:  100.0 %, Recall:  5.5556 %, F1:  10.5264 %
Decision  3  => Accuray:  88.8889 %, Precision:  98.0769 %, Recall:  56.6667 %, F1:  71.831 %
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:14.784230


Check Prediction for the first column

In [1188]:
dfTrain_smote.iloc[0]
Out[1188]:
STG    0.28
SCG     0.1
STR    0.12
LPR    0.28
PEG    0.32
UNS       1
Name: 0, dtype: object
In [1189]:
prediction = chef.predict(model, dfTrain_smote.iloc[0])
In [1190]:
prediction
Out[1190]:
'1'

Check Prediction for enire data set

In [1191]:
for index, instance in dfTrain_smote.iterrows():
    prediction = chef.predict(model, instance)
    actual = instance['UNS']
    
    if actual == prediction:
        classified = True
    else:
        classified = False
        print("*",end='')
        #Mark predictions which are not right, with a * mark at the start
    
    print ("Actual: ", actual, "--Prediction: ", prediction, "--Error: ", int(prediction) - int(actual))
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  0 --Prediction:  0 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  2 --Prediction:  3 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  0 --Prediction:  0 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  0 --Prediction:  0 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  2 --Prediction:  1 --Error:  -1
Actual:  1 --Prediction:  1 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  0 --Prediction:  0 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  0 --Prediction:  0 --Error:  0
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
*Actual:  0 --Prediction:  1 --Error:  1
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
Actual:  2 --Prediction:  2 --Error:  0
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  2 --Prediction:  1 --Error:  -1
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
Actual:  3 --Prediction:  3 --Error:  0
*Actual:  3 --Prediction:  2 --Error:  -1

Check Feature Importance

In [1192]:
rules = "outputs/rules/rules.py"
fi = chef.feature_importance(rules).set_index("feature")
fi.plot(kind="barh", title="Feature Importance");
print(fi)
Decision rule:  outputs/rules/rules.py
         importance
feature            
STG          0.6348
SCG          0.3207
LPR          0.2999
STR         -0.0020
PEG         -0.2534

This shows that the STG Feature is the most important feature, post which we have the LPR and SCG feature, after which is the PEG Feature.

Evaluate the model using dfTest

In [1193]:
# predict test data
print('Predicting test data')
test_start_time = datetime.now()
chef.evaluate(model, dfTest, task="test", target_label = 'UNS')
test_end_time = datetime.now()
print('Done \n \n')
ChefBoostresults['testing_time'] = test_end_time - test_start_time
print('---------------------')
print('|   Testing Time    |')
print('---------------------')
print('testing time(HH:MM:SS:ms) - {}\n\n'.format(ChefBoostresults['testing_time']))
Predicting test data
-------------------------
Evaluate  test set
-------------------------
Accuracy:  65.83333333333333 % on  120  instances
Labels:  ['3' '1' '2' '0']
Confusion matrix:  [[8, 0, 1, 0], [0, 38, 5, 13], [21, 1, 31, 0], [0, 0, 0, 2]]
Decision  3  => Accuray:  81.6667 %, Precision:  88.8889 %, Recall:  27.5862 %, F1:  42.1053 %
Decision  1  => Accuray:  84.1667 %, Precision:  67.8571 %, Recall:  97.4359 %, F1:  80.0 %
Decision  2  => Accuray:  76.6667 %, Precision:  58.4906 %, Recall:  83.7838 %, F1:  68.8889 %
Decision  0  => Accuray:  89.1667 %, Precision:  100.0 %, Recall:  13.3333 %, F1:  23.5294 %
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.056966


In [1194]:
Feature_Subset_Result.append(("chef.fit", "chef.fit(dfTrain_smote.copy(deep=True), config = {'algorithm': 'C4.5', 'enableParallelism': [False], 'num_cores': [1]}, target_label = 'UNS')", 
                              ChefBoostresults['training_time'],  ChefBoostresults['testing_time'],
                              "0.6055", "0.6583",
                              "[[90, 17, 85, 0], [0, 72, 0, 39], [0, 0, 5, 0], [0, 1, 0, 51]]", "[[8, 0, 1, 0], [0, 38, 5, 13], [21, 1, 31, 0], [0, 0, 0, 2]]",
                              "-",
                              "", "", "", "", "",
                              "", "",
                              "",
                              ChefBoostresults['X_train'],ChefBoostresults['y_train'],ChefBoostresults['X_test'],ChefBoostresults['y_test']))

8.3 Calculate the best Decision Tree Model using GridSearchCV using Entopy Criterion that works on Information Gain with Original Data

In [1195]:
# start Grid search with Entropy Criterion
parameters = {'criterion': ['entropy'], 
              'splitter': ['best', 'random'], 
              'max_depth':[None],
              'min_samples_split':[2],
              'min_samples_leaf':[1],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]}
des_tree_entropy = DecisionTreeClassifier()
des_tree_entropy_grid = GridSearchCV(des_tree_entropy, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results = perform_model(des_tree_entropy_grid, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_entropy_grid_results)
training the model..
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:00.241866


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.004998


---------------------
| Training Set Accuracy |
---------------------

    1.0


---------------------
| Testing Set Accuracy  |
---------------------

    0.9166666666666666


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[35  0  0  0]
 [ 0 90  0  0]
 [ 0  0 85  0]
 [ 0  0  0 68]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 36  1  0]
 [ 0  3 33  1]
 [ 0  0  0 29]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.86      0.92      0.89        39
           2       0.97      0.89      0.93        37
           3       0.97      1.00      0.98        29

    accuracy                           0.92       120
   macro avg       0.91      0.90      0.91       120
weighted avg       0.92      0.92      0.92       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 entropy = 2.0 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.135 entropy = 1.338 samples = 61 value = [30.0, 30.0, 5.676, 0.0] 0->1 True 28 LPR ≤ 0.135 entropy = 0.992 samples = 59 value = [0.0, 0.0, 24.324, 30.0] 0->28 False 2 LPR ≤ 0.665 entropy = 0.328 samples = 14 value = [24.0, 1.538, 0.0, 0.0] 1->2 7 PEG ≤ 0.255 entropy = 1.161 samples = 47 value = [6.0, 28.462, 5.676, 0.0] 1->7 3 entropy = 0.0 samples = 12 value = [24, 0, 0, 0] 2->3 4 STR ≤ 0.22 entropy = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 2->4 5 entropy = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 4->5 6 entropy = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 4->6 8 STG ≤ 0.35 entropy = 0.927 samples = 18 value = [6.0, 11.538, 0.0, 0.0] 7->8 19 LPR ≤ 0.555 entropy = 0.813 samples = 29 value = [0.0, 16.923, 5.676, 0.0] 7->19 9 PEG ≤ 0.195 entropy = 0.965 samples = 8 value = [6.0, 3.846, 0.0, 0.0] 8->9 16 STG ≤ 0.385 entropy = 0.0 samples = 10 value = [0.0, 7.692, 0.0, 0.0] 8->16 10 entropy = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 9->10 11 PEG ≤ 0.205 entropy = 0.852 samples = 6 value = [6.0, 2.308, 0.0, 0.0] 9->11 12 entropy = 0.0 samples = 1 value = [2, 0, 0, 0] 11->12 13 LPR ≤ 0.475 entropy = 0.947 samples = 5 value = [4.0, 2.308, 0.0, 0.0] 11->13 14 entropy = 0.0 samples = 2 value = [4, 0, 0, 0] 13->14 15 entropy = -0.0 samples = 3 value = [0.0, 2.308, 0.0, 0.0] 13->15 17 entropy = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 16->17 18 entropy = 0.0 samples = 9 value = [0.0, 6.923, 0.0, 0.0] 16->18 20 PEG ≤ 0.27 entropy = 0.0 samples = 21 value = [0.0, 16.154, 0.0, 0.0] 19->20 23 SCG ≤ 0.35 entropy = 0.528 samples = 8 value = [0.0, 0.769, 5.676, 0.0] 19->23 21 entropy = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 20->21 22 entropy = 0.0 samples = 19 value = [0.0, 14.615, 0.0, 0.0] 20->22 24 entropy = 0.0 samples = 5 value = [0.0, 0.0, 4.054, 0.0] 23->24 25 SCG ≤ 0.535 entropy = 0.906 samples = 3 value = [0.0, 0.769, 1.622, 0.0] 23->25 26 entropy = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 25->26 27 entropy = 0.0 samples = 2 value = [0.0, 0.0, 1.622, 0.0] 25->27 29 entropy = 0.0 samples = 7 value = [0.0, 0.0, 5.676, 0.0] 28->29 30 PEG ≤ 0.665 entropy = 0.96 samples = 52 value = [0.0, 0.0, 18.649, 30.0] 28->30 31 LPR ≤ 0.85 entropy = 0.469 samples = 25 value = [0.0, 0.0, 18.649, 2.069] 30->31 36 STR ≤ 0.125 entropy = 0.0 samples = 27 value = [0.0, 0.0, 0.0, 27.931] 30->36 32 entropy = 0.0 samples = 23 value = [0.0, 0.0, 18.649, 0.0] 31->32 33 PEG ≤ 0.61 entropy = 0.0 samples = 2 value = [0.0, 0.0, 0.0, 2.069] 31->33 34 entropy = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 33->34 35 entropy = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 33->35 37 entropy = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 36->37 38 entropy = 0.0 samples = 26 value = [0.0, 0.0, 0.0, 26.897] 36->38
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 39 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 28.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 7.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a split node: go to node 5 if X[:, 2] <= 0.2199999950826168 else to node 6.
				node=5 is a leaf node.
				node=6 is a leaf node.
		node=7 is a split node: go to node 8 if X[:, 4] <= 0.2549999952316284 else to node 19.
			node=8 is a split node: go to node 9 if X[:, 0] <= 0.3499999940395355 else to node 16.
				node=9 is a split node: go to node 10 if X[:, 4] <= 0.19500000029802322 else to node 11.
					node=10 is a leaf node.
					node=11 is a split node: go to node 12 if X[:, 4] <= 0.20499999821186066 else to node 13.
						node=12 is a leaf node.
						node=13 is a split node: go to node 14 if X[:, 3] <= 0.4749999940395355 else to node 15.
							node=14 is a leaf node.
							node=15 is a leaf node.
				node=16 is a split node: go to node 17 if X[:, 0] <= 0.38499999046325684 else to node 18.
					node=17 is a leaf node.
					node=18 is a leaf node.
			node=19 is a split node: go to node 20 if X[:, 3] <= 0.5550000071525574 else to node 23.
				node=20 is a split node: go to node 21 if X[:, 4] <= 0.26999999582767487 else to node 22.
					node=21 is a leaf node.
					node=22 is a leaf node.
				node=23 is a split node: go to node 24 if X[:, 1] <= 0.3499999940395355 else to node 25.
					node=24 is a leaf node.
					node=25 is a split node: go to node 26 if X[:, 1] <= 0.5349999815225601 else to node 27.
						node=26 is a leaf node.
						node=27 is a leaf node.
	node=28 is a split node: go to node 29 if X[:, 3] <= 0.13499999791383743 else to node 30.
		node=29 is a leaf node.
		node=30 is a split node: go to node 31 if X[:, 4] <= 0.6650000214576721 else to node 36.
			node=31 is a split node: go to node 32 if X[:, 3] <= 0.8500000238418579 else to node 33.
				node=32 is a leaf node.
				node=33 is a split node: go to node 34 if X[:, 4] <= 0.6100000143051147 else to node 35.
					node=34 is a leaf node.
					node=35 is a leaf node.
			node=36 is a split node: go to node 37 if X[:, 2] <= 0.12500000186264515 else to node 38.
				node=37 is a leaf node.
				node=38 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.00000000e+00 1.94325505e-17 2.35305905e-16 3.04866340e-16
 5.62521852e-03 5.71558615e-03 6.11343912e-03 7.43356304e-03
 8.48661518e-03 8.63291106e-03 1.09367460e-02 1.12140546e-02
 1.32357191e-02 1.68540384e-02 2.95475235e-02 3.13389697e-02
 6.73639191e-02 9.76412209e-02 1.39081296e-01 2.58130803e-01
 3.69263963e-01 8.12292430e-01]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_features='auto', random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9246922237805828

8.4 Calculate the best Decision Tree Model using GridSearchCV using Entopy Criterion that works on Information Gain with Balanced Data using SMOTE

In [1196]:
# start Grid search with Entropy Criterion
parameters = {'criterion': ['entropy'], 
              'splitter': ['best', 'random'], 
              'max_depth':[None],
              'min_samples_split':[2],
              'min_samples_leaf':[1],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]}
des_tree_entropy = DecisionTreeClassifier()
des_tree_entropy_grid = GridSearchCV(des_tree_entropy, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results = perform_model(des_tree_entropy_grid, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_entropy_grid_results)
training the model..
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:00.290833


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.008995


---------------------
| Training Set Accuracy |
---------------------

    1.0


---------------------
| Testing Set Accuracy  |
---------------------

    0.9083333333333333


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[90  0  0  0]
 [ 0 90  0  0]
 [ 0  0 90  0]
 [ 0  0  0 90]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 36  1  0]
 [ 0  3 33  1]
 [ 0  0  1 28]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.86      0.92      0.89        39
           2       0.94      0.89      0.92        37
           3       0.97      0.97      0.97        29

    accuracy                           0.91       120
   macro avg       0.91      0.90      0.90       120
weighted avg       0.91      0.91      0.91       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 entropy = 1.921 samples = 120 value = [15, 39, 37, 29] 1 PEG ≤ 0.135 entropy = 1.269 samples = 61 value = [15, 39, 7, 0] 0->1 True 28 LPR ≤ 0.795 entropy = 1.0 samples = 59 value = [0, 0, 30, 29] 0->28 False 2 LPR ≤ 0.665 entropy = 0.592 samples = 14 value = [12, 2, 0, 0] 1->2 5 PEG ≤ 0.255 entropy = 0.934 samples = 47 value = [3, 37, 7, 0] 1->5 3 entropy = 0.0 samples = 12 value = [12, 0, 0, 0] 2->3 4 entropy = 0.0 samples = 2 value = [0, 2, 0, 0] 2->4 6 STR ≤ 0.305 entropy = 0.65 samples = 18 value = [3, 15, 0, 0] 5->6 19 LPR ≤ 0.555 entropy = 0.797 samples = 29 value = [0, 22, 7, 0] 5->19 7 STG ≤ 0.42 entropy = 0.918 samples = 6 value = [2, 4, 0, 0] 6->7 14 LPR ≤ 0.285 entropy = 0.414 samples = 12 value = [1, 11, 0, 0] 6->14 8 PEG ≤ 0.245 entropy = 0.918 samples = 3 value = [2, 1, 0, 0] 7->8 13 entropy = 0.0 samples = 3 value = [0, 3, 0, 0] 7->13 9 SCG ≤ 0.14 entropy = 1.0 samples = 2 value = [1, 1, 0, 0] 8->9 12 entropy = 0.0 samples = 1 value = [1, 0, 0, 0] 8->12 10 entropy = 0.0 samples = 1 value = [0, 1, 0, 0] 9->10 11 entropy = 0.0 samples = 1 value = [1, 0, 0, 0] 9->11 15 STG ≤ 0.412 entropy = 1.0 samples = 2 value = [1, 1, 0, 0] 14->15 18 entropy = 0.0 samples = 10 value = [0, 10, 0, 0] 14->18 16 entropy = 0.0 samples = 1 value = [1, 0, 0, 0] 15->16 17 entropy = 0.0 samples = 1 value = [0, 1, 0, 0] 15->17 20 entropy = 0.0 samples = 21 value = [0, 21, 0, 0] 19->20 21 SCG ≤ 0.35 entropy = 0.544 samples = 8 value = [0, 1, 7, 0] 19->21 22 entropy = 0.0 samples = 5 value = [0, 0, 5, 0] 21->22 23 LPR ≤ 0.665 entropy = 0.918 samples = 3 value = [0, 1, 2, 0] 21->23 24 entropy = 0.0 samples = 1 value = [0, 0, 1, 0] 23->24 25 PEG ≤ 0.275 entropy = 1.0 samples = 2 value = [0, 1, 1, 0] 23->25 26 entropy = 0.0 samples = 1 value = [0, 1, 0, 0] 25->26 27 entropy = 0.0 samples = 1 value = [0, 0, 1, 0] 25->27 29 PEG ≤ 0.695 entropy = 0.983 samples = 52 value = [0, 0, 30, 22] 28->29 32 entropy = 0.0 samples = 7 value = [0, 0, 0, 7] 28->32 30 entropy = 0.0 samples = 30 value = [0, 0, 30, 0] 29->30 31 entropy = 0.0 samples = 22 value = [0, 0, 0, 22] 29->31
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 33 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 28.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 5.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a leaf node.
		node=5 is a split node: go to node 6 if X[:, 4] <= 0.2549999952316284 else to node 19.
			node=6 is a split node: go to node 7 if X[:, 2] <= 0.3050000071525574 else to node 14.
				node=7 is a split node: go to node 8 if X[:, 0] <= 0.41999998688697815 else to node 13.
					node=8 is a split node: go to node 9 if X[:, 4] <= 0.24499999731779099 else to node 12.
						node=9 is a split node: go to node 10 if X[:, 1] <= 0.14000000059604645 else to node 11.
							node=10 is a leaf node.
							node=11 is a leaf node.
						node=12 is a leaf node.
					node=13 is a leaf node.
				node=14 is a split node: go to node 15 if X[:, 3] <= 0.2849999964237213 else to node 18.
					node=15 is a split node: go to node 16 if X[:, 0] <= 0.4124999940395355 else to node 17.
						node=16 is a leaf node.
						node=17 is a leaf node.
					node=18 is a leaf node.
			node=19 is a split node: go to node 20 if X[:, 3] <= 0.5550000071525574 else to node 21.
				node=20 is a leaf node.
				node=21 is a split node: go to node 22 if X[:, 1] <= 0.3499999940395355 else to node 23.
					node=22 is a leaf node.
					node=23 is a split node: go to node 24 if X[:, 3] <= 0.6650000214576721 else to node 25.
						node=24 is a leaf node.
						node=25 is a split node: go to node 26 if X[:, 4] <= 0.2749999910593033 else to node 27.
							node=26 is a leaf node.
							node=27 is a leaf node.
	node=28 is a split node: go to node 29 if X[:, 3] <= 0.7950000166893005 else to node 32.
		node=29 is a split node: go to node 30 if X[:, 4] <= 0.6949999928474426 else to node 31.
			node=30 is a leaf node.
			node=31 is a leaf node.
		node=32 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.         0.00631268 0.00765247 0.0090142  0.01053307 0.01083371
 0.01111111 0.01125547 0.01258146 0.01771874 0.02094643 0.02599923
 0.02899021 0.03215664 0.06407498 0.09761783 0.18459135 0.35301946
 0.80604746]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(criterion='entropy', max_features='auto', random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9222222222222222

8.5 Calculate the best Decision Tree Model using GridSearchCV using Log_Loss Criterion that works on Information Gain with Original Data

In [1197]:
# start Grid search with Log_Loss Criterion
parameters = {'criterion': ['log_loss'], 
              'splitter': ['best', 'random'], 
              'max_depth':[None],
              'min_samples_split':[2],
              'min_samples_leaf':[1],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]}
des_tree_logloss = DecisionTreeClassifier()
des_tree_logloss_grid = GridSearchCV(des_tree_logloss, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results = perform_model(des_tree_logloss_grid, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_logloss_grid_results)
training the model..
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:00.299831


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.007997


---------------------
| Training Set Accuracy |
---------------------

    1.0


---------------------
| Testing Set Accuracy  |
---------------------

    0.9166666666666666


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[35  0  0  0]
 [ 0 90  0  0]
 [ 0  0 85  0]
 [ 0  0  0 68]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 36  1  0]
 [ 0  3 33  1]
 [ 0  0  0 29]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.86      0.92      0.89        39
           2       0.97      0.89      0.93        37
           3       0.97      1.00      0.98        29

    accuracy                           0.92       120
   macro avg       0.91      0.90      0.91       120
weighted avg       0.92      0.92      0.92       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 log_loss = 2.0 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.135 log_loss = 1.338 samples = 61 value = [30.0, 30.0, 5.676, 0.0] 0->1 True 28 LPR ≤ 0.135 log_loss = 0.992 samples = 59 value = [0.0, 0.0, 24.324, 30.0] 0->28 False 2 LPR ≤ 0.665 log_loss = 0.328 samples = 14 value = [24.0, 1.538, 0.0, 0.0] 1->2 7 PEG ≤ 0.255 log_loss = 1.161 samples = 47 value = [6.0, 28.462, 5.676, 0.0] 1->7 3 log_loss = 0.0 samples = 12 value = [24, 0, 0, 0] 2->3 4 STR ≤ 0.22 log_loss = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 2->4 5 log_loss = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 4->5 6 log_loss = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 4->6 8 STG ≤ 0.35 log_loss = 0.927 samples = 18 value = [6.0, 11.538, 0.0, 0.0] 7->8 19 LPR ≤ 0.555 log_loss = 0.813 samples = 29 value = [0.0, 16.923, 5.676, 0.0] 7->19 9 PEG ≤ 0.195 log_loss = 0.965 samples = 8 value = [6.0, 3.846, 0.0, 0.0] 8->9 16 STG ≤ 0.385 log_loss = 0.0 samples = 10 value = [0.0, 7.692, 0.0, 0.0] 8->16 10 log_loss = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 9->10 11 PEG ≤ 0.205 log_loss = 0.852 samples = 6 value = [6.0, 2.308, 0.0, 0.0] 9->11 12 log_loss = 0.0 samples = 1 value = [2, 0, 0, 0] 11->12 13 LPR ≤ 0.475 log_loss = 0.947 samples = 5 value = [4.0, 2.308, 0.0, 0.0] 11->13 14 log_loss = 0.0 samples = 2 value = [4, 0, 0, 0] 13->14 15 log_loss = -0.0 samples = 3 value = [0.0, 2.308, 0.0, 0.0] 13->15 17 log_loss = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 16->17 18 log_loss = 0.0 samples = 9 value = [0.0, 6.923, 0.0, 0.0] 16->18 20 PEG ≤ 0.27 log_loss = 0.0 samples = 21 value = [0.0, 16.154, 0.0, 0.0] 19->20 23 SCG ≤ 0.35 log_loss = 0.528 samples = 8 value = [0.0, 0.769, 5.676, 0.0] 19->23 21 log_loss = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 20->21 22 log_loss = 0.0 samples = 19 value = [0.0, 14.615, 0.0, 0.0] 20->22 24 log_loss = 0.0 samples = 5 value = [0.0, 0.0, 4.054, 0.0] 23->24 25 SCG ≤ 0.535 log_loss = 0.906 samples = 3 value = [0.0, 0.769, 1.622, 0.0] 23->25 26 log_loss = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 25->26 27 log_loss = 0.0 samples = 2 value = [0.0, 0.0, 1.622, 0.0] 25->27 29 log_loss = 0.0 samples = 7 value = [0.0, 0.0, 5.676, 0.0] 28->29 30 PEG ≤ 0.665 log_loss = 0.96 samples = 52 value = [0.0, 0.0, 18.649, 30.0] 28->30 31 LPR ≤ 0.85 log_loss = 0.469 samples = 25 value = [0.0, 0.0, 18.649, 2.069] 30->31 36 STR ≤ 0.125 log_loss = 0.0 samples = 27 value = [0.0, 0.0, 0.0, 27.931] 30->36 32 log_loss = 0.0 samples = 23 value = [0.0, 0.0, 18.649, 0.0] 31->32 33 PEG ≤ 0.61 log_loss = 0.0 samples = 2 value = [0.0, 0.0, 0.0, 2.069] 31->33 34 log_loss = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 33->34 35 log_loss = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 33->35 37 log_loss = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 36->37 38 log_loss = 0.0 samples = 26 value = [0.0, 0.0, 0.0, 26.897] 36->38
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 39 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 28.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 7.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a split node: go to node 5 if X[:, 2] <= 0.2199999950826168 else to node 6.
				node=5 is a leaf node.
				node=6 is a leaf node.
		node=7 is a split node: go to node 8 if X[:, 4] <= 0.2549999952316284 else to node 19.
			node=8 is a split node: go to node 9 if X[:, 0] <= 0.3499999940395355 else to node 16.
				node=9 is a split node: go to node 10 if X[:, 4] <= 0.19500000029802322 else to node 11.
					node=10 is a leaf node.
					node=11 is a split node: go to node 12 if X[:, 4] <= 0.20499999821186066 else to node 13.
						node=12 is a leaf node.
						node=13 is a split node: go to node 14 if X[:, 3] <= 0.4749999940395355 else to node 15.
							node=14 is a leaf node.
							node=15 is a leaf node.
				node=16 is a split node: go to node 17 if X[:, 0] <= 0.38499999046325684 else to node 18.
					node=17 is a leaf node.
					node=18 is a leaf node.
			node=19 is a split node: go to node 20 if X[:, 3] <= 0.5550000071525574 else to node 23.
				node=20 is a split node: go to node 21 if X[:, 4] <= 0.26999999582767487 else to node 22.
					node=21 is a leaf node.
					node=22 is a leaf node.
				node=23 is a split node: go to node 24 if X[:, 1] <= 0.3499999940395355 else to node 25.
					node=24 is a leaf node.
					node=25 is a split node: go to node 26 if X[:, 1] <= 0.5349999815225601 else to node 27.
						node=26 is a leaf node.
						node=27 is a leaf node.
	node=28 is a split node: go to node 29 if X[:, 3] <= 0.13499999791383743 else to node 30.
		node=29 is a leaf node.
		node=30 is a split node: go to node 31 if X[:, 4] <= 0.6650000214576721 else to node 36.
			node=31 is a split node: go to node 32 if X[:, 3] <= 0.8500000238418579 else to node 33.
				node=32 is a leaf node.
				node=33 is a split node: go to node 34 if X[:, 4] <= 0.6100000143051147 else to node 35.
					node=34 is a leaf node.
					node=35 is a leaf node.
			node=36 is a split node: go to node 37 if X[:, 2] <= 0.12500000186264515 else to node 38.
				node=37 is a leaf node.
				node=38 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.00000000e+00 1.94325505e-17 2.35305905e-16 3.04866340e-16
 5.62521852e-03 5.71558615e-03 6.11343912e-03 7.43356304e-03
 8.48661518e-03 8.63291106e-03 1.09367460e-02 1.12140546e-02
 1.32357191e-02 1.68540384e-02 2.95475235e-02 3.13389697e-02
 6.73639191e-02 9.76412209e-02 1.39081296e-01 2.58130803e-01
 3.69263963e-01 8.12292430e-01]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(class_weight='balanced', criterion='log_loss',
                       max_features='auto', random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'log_loss', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9246922237805828

8.6 Calculate the best Decision Tree Model using GridSearchCV using Log_Loss Criterion that works on Information Gain with Balanced Data using SMOTE

In [1198]:
# start Grid search with Log_Loss Criterion
parameters = {'criterion': ['log_loss'], 
              'splitter': ['best', 'random'], 
              'max_depth':[None],
              'min_samples_split':[2],
              'min_samples_leaf':[1],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]}
des_tree_logloss = DecisionTreeClassifier()
des_tree_logloss_grid = GridSearchCV(des_tree_logloss, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results = perform_model(des_tree_logloss_grid, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_logloss_grid_results)
training the model..
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:00.330811


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.004999


---------------------
| Training Set Accuracy |
---------------------

    1.0


---------------------
| Testing Set Accuracy  |
---------------------

    0.9083333333333333


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[90  0  0  0]
 [ 0 90  0  0]
 [ 0  0 90  0]
 [ 0  0  0 90]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 36  1  0]
 [ 0  3 33  1]
 [ 0  0  1 28]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.86      0.92      0.89        39
           2       0.94      0.89      0.92        37
           3       0.97      0.97      0.97        29

    accuracy                           0.91       120
   macro avg       0.91      0.90      0.90       120
weighted avg       0.91      0.91      0.91       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 log_loss = 1.921 samples = 120 value = [15, 39, 37, 29] 1 PEG ≤ 0.135 log_loss = 1.269 samples = 61 value = [15, 39, 7, 0] 0->1 True 28 LPR ≤ 0.795 log_loss = 1.0 samples = 59 value = [0, 0, 30, 29] 0->28 False 2 LPR ≤ 0.665 log_loss = 0.592 samples = 14 value = [12, 2, 0, 0] 1->2 5 PEG ≤ 0.255 log_loss = 0.934 samples = 47 value = [3, 37, 7, 0] 1->5 3 log_loss = 0.0 samples = 12 value = [12, 0, 0, 0] 2->3 4 log_loss = 0.0 samples = 2 value = [0, 2, 0, 0] 2->4 6 STR ≤ 0.305 log_loss = 0.65 samples = 18 value = [3, 15, 0, 0] 5->6 19 LPR ≤ 0.555 log_loss = 0.797 samples = 29 value = [0, 22, 7, 0] 5->19 7 STG ≤ 0.42 log_loss = 0.918 samples = 6 value = [2, 4, 0, 0] 6->7 14 LPR ≤ 0.285 log_loss = 0.414 samples = 12 value = [1, 11, 0, 0] 6->14 8 PEG ≤ 0.245 log_loss = 0.918 samples = 3 value = [2, 1, 0, 0] 7->8 13 log_loss = 0.0 samples = 3 value = [0, 3, 0, 0] 7->13 9 SCG ≤ 0.14 log_loss = 1.0 samples = 2 value = [1, 1, 0, 0] 8->9 12 log_loss = 0.0 samples = 1 value = [1, 0, 0, 0] 8->12 10 log_loss = 0.0 samples = 1 value = [0, 1, 0, 0] 9->10 11 log_loss = 0.0 samples = 1 value = [1, 0, 0, 0] 9->11 15 STG ≤ 0.412 log_loss = 1.0 samples = 2 value = [1, 1, 0, 0] 14->15 18 log_loss = 0.0 samples = 10 value = [0, 10, 0, 0] 14->18 16 log_loss = 0.0 samples = 1 value = [1, 0, 0, 0] 15->16 17 log_loss = 0.0 samples = 1 value = [0, 1, 0, 0] 15->17 20 log_loss = 0.0 samples = 21 value = [0, 21, 0, 0] 19->20 21 SCG ≤ 0.35 log_loss = 0.544 samples = 8 value = [0, 1, 7, 0] 19->21 22 log_loss = 0.0 samples = 5 value = [0, 0, 5, 0] 21->22 23 LPR ≤ 0.665 log_loss = 0.918 samples = 3 value = [0, 1, 2, 0] 21->23 24 log_loss = 0.0 samples = 1 value = [0, 0, 1, 0] 23->24 25 PEG ≤ 0.275 log_loss = 1.0 samples = 2 value = [0, 1, 1, 0] 23->25 26 log_loss = 0.0 samples = 1 value = [0, 1, 0, 0] 25->26 27 log_loss = 0.0 samples = 1 value = [0, 0, 1, 0] 25->27 29 PEG ≤ 0.695 log_loss = 0.983 samples = 52 value = [0, 0, 30, 22] 28->29 32 log_loss = 0.0 samples = 7 value = [0, 0, 0, 7] 28->32 30 log_loss = 0.0 samples = 30 value = [0, 0, 30, 0] 29->30 31 log_loss = 0.0 samples = 22 value = [0, 0, 0, 22] 29->31
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 33 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 28.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 5.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a leaf node.
		node=5 is a split node: go to node 6 if X[:, 4] <= 0.2549999952316284 else to node 19.
			node=6 is a split node: go to node 7 if X[:, 2] <= 0.3050000071525574 else to node 14.
				node=7 is a split node: go to node 8 if X[:, 0] <= 0.41999998688697815 else to node 13.
					node=8 is a split node: go to node 9 if X[:, 4] <= 0.24499999731779099 else to node 12.
						node=9 is a split node: go to node 10 if X[:, 1] <= 0.14000000059604645 else to node 11.
							node=10 is a leaf node.
							node=11 is a leaf node.
						node=12 is a leaf node.
					node=13 is a leaf node.
				node=14 is a split node: go to node 15 if X[:, 3] <= 0.2849999964237213 else to node 18.
					node=15 is a split node: go to node 16 if X[:, 0] <= 0.4124999940395355 else to node 17.
						node=16 is a leaf node.
						node=17 is a leaf node.
					node=18 is a leaf node.
			node=19 is a split node: go to node 20 if X[:, 3] <= 0.5550000071525574 else to node 21.
				node=20 is a leaf node.
				node=21 is a split node: go to node 22 if X[:, 1] <= 0.3499999940395355 else to node 23.
					node=22 is a leaf node.
					node=23 is a split node: go to node 24 if X[:, 3] <= 0.6650000214576721 else to node 25.
						node=24 is a leaf node.
						node=25 is a split node: go to node 26 if X[:, 4] <= 0.2749999910593033 else to node 27.
							node=26 is a leaf node.
							node=27 is a leaf node.
	node=28 is a split node: go to node 29 if X[:, 3] <= 0.7950000166893005 else to node 32.
		node=29 is a split node: go to node 30 if X[:, 4] <= 0.6949999928474426 else to node 31.
			node=30 is a leaf node.
			node=31 is a leaf node.
		node=32 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.         0.00631268 0.00765247 0.0090142  0.01053307 0.01083371
 0.01111111 0.01125547 0.01258146 0.01771874 0.02094643 0.02599923
 0.02899021 0.03215664 0.06407498 0.09761783 0.18459135 0.35301946
 0.80604746]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(criterion='log_loss', max_features='auto',
                       random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'log_loss', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9222222222222222

8.7 Compare performance of models

In [1199]:
for item in Feature_Subset_Result:
    dFFeature_Subset_Result.loc[dFFeature_Subset_Result.shape[0]] = item
In [1200]:
print(dFFeature_Subset_Result[['Best Estimator','Training Set Accuracy', 'Testing Set Accuracy', 'Training Time', 'Testing Time',
                                                'Avg. Cross Validation Score of Best Estimator']].to_markdown(tablefmt="pretty"))
+---+----------------------------------------------------------------------------------+-----------------------+----------------------+------------------------+------------------------+-----------------------------------------------+
|   |                                  Best Estimator                                  | Training Set Accuracy | Testing Set Accuracy |     Training Time      |      Testing Time      | Avg. Cross Validation Score of Best Estimator |
+---+----------------------------------------------------------------------------------+-----------------------+----------------------+------------------------+------------------------+-----------------------------------------------+
| 0 |       DecisionTreeClassifier(class_weight='balanced', max_features='auto',       |          1.0          |  0.8833333333333333  | 0 days 00:00:08.519116 | 0 days 00:00:00.006995 |               0.906459404706249               |
|   |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
| 1 |           DecisionTreeClassifier(max_features='auto', random_state=0)            |          1.0          |  0.8916666666666667  | 0 days 00:00:00.327816 | 0 days 00:00:00.006993 |              0.9277777777777777               |
| 2 |                                     chef.fit                                     |        0.7086         |        0.650         | 0 days 00:00:15.182259 | 0 days 00:00:00.094945 |                                               |
| 3 |                                     chef.fit                                     |        0.6055         |        0.6583        | 0 days 00:00:14.784230 | 0 days 00:00:00.056966 |                                               |
| 4 |       DecisionTreeClassifier(class_weight='balanced', criterion='entropy',       |          1.0          |  0.9166666666666666  | 0 days 00:00:00.241866 | 0 days 00:00:00.004998 |              0.9246922237805828               |
|   |                                  max_features='auto', random_state=0)            |                       |                      |                        |                        |                                               |
| 5 | DecisionTreeClassifier(criterion='entropy', max_features='auto', random_state=0) |          1.0          |  0.9083333333333333  | 0 days 00:00:00.290833 | 0 days 00:00:00.008995 |              0.9222222222222222               |
| 6 |      DecisionTreeClassifier(class_weight='balanced', criterion='log_loss',       |          1.0          |  0.9166666666666666  | 0 days 00:00:00.299831 | 0 days 00:00:00.007997 |              0.9246922237805828               |
|   |                                  max_features='auto', random_state=0)            |                       |                      |                        |                        |                                               |
| 7 |        DecisionTreeClassifier(criterion='log_loss', max_features='auto',         |          1.0          |  0.9083333333333333  | 0 days 00:00:00.330811 | 0 days 00:00:00.004999 |              0.9222222222222222               |
|   |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
+---+----------------------------------------------------------------------------------+-----------------------+----------------------+------------------------+------------------------+-----------------------------------------------+

Conclusion: There is overfitting in the models as Testing Set Accuracy is lesser than Training Set Accuracy. This needs to be corrected using Pre-Pruning and Post-Pruning.

A) Best scores using Original and Balance Data before any pruning

  • Best Scores with Gini Index using Gini @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.8916
  • Best Scores with Gain Ratio using ChefBoost
    • Testing Set Accuracy: 0.6583
  • Best Scores with Information Gain using Log_Loss @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.9166

9.0 Q-6: Print confusion matrix and classification report before and after pruning the Decision tree. Write down your observations on the effects of pruning. - (1+1)Marks

9.1 Pre-Pruning Method

Pre pruning is nothing but stoping the growth of decision tree on an early stage. For that we can limit the growth of trees by setting constrains. We can limit parameters like max_depth , min_samples etc.

An effective way to do is that we can grid search those parameters and choose the optimum values that gives better performace on test data.

As of now we will control these parameters

  • max_depth: maximum depth of decision tree
  • min_sample_split: The minimum number of samples required to split an internal node:
  • min_samples_leaf: The minimum number of samples required to be at a leaf node.

9.1.1 Pre-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Gini Criterion with Original Data

In [1201]:
# start Grid search with Gini Criterion
parameters = {'criterion': ['gini'], 
              'splitter': ['best', 'random'], 
              'max_depth':[2,4,6,8,10,12],
              'min_samples_split':[2,3,4],
              'min_samples_leaf':[1,2],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]} #for pruning use [0.0, 0.1, 0.01, 0.001]

des_tree_gini_prepruning = DecisionTreeClassifier()
des_tree_gini_grid_prepruning = GridSearchCV(des_tree_gini_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_gini_grid_results_prepruning = perform_model(des_tree_gini_grid_prepruning, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_gini_grid_results_prepruning)
training the model..
Fitting 3 folds for each of 432 candidates, totalling 1296 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:05.274972


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.004999


---------------------
| Training Set Accuracy |
---------------------

    0.9928057553956835


---------------------
| Testing Set Accuracy  |
---------------------

    0.925


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[35  0  0  0]
 [ 0 90  0  0]
 [ 0  2 83  0]
 [ 0  0  0 68]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 37  0  0]
 [ 0  3 33  1]
 [ 0  0  0 29]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.86      0.95      0.90        39
           2       1.00      0.89      0.94        37
           3       0.97      1.00      0.98        29

    accuracy                           0.93       120
   macro avg       0.92      0.91      0.91       120
weighted avg       0.93      0.93      0.93       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.69 gini = 0.75 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.335 gini = 0.687 samples = 94 value = [30.0, 30.0, 30.0, 3.103] 0->1 True 44 gini = -0.0 samples = 26 value = [0.0, 0.0, 0.0, 26.897] 0->44 False 2 LPR ≤ 0.265 gini = 0.567 samples = 59 value = [30.0, 29.231, 4.865, 0.0] 1->2 33 PEG ≤ 0.655 gini = 0.237 samples = 35 value = [0.0, 0.769, 25.135, 3.103] 1->33 3 PEG ≤ 0.27 gini = 0.316 samples = 18 value = [22.0, 5.385, 0.0, 0.0] 2->3 12 PEG ≤ 0.13 gini = 0.513 samples = 41 value = [8.0, 23.846, 4.865, 0.0] 2->12 4 PEG ≤ 0.22 gini = 0.065 samples = 12 value = [22.0, 0.769, 0.0, 0.0] 3->4 9 PEG ≤ 0.3 gini = 0.0 samples = 6 value = [0.0, 4.615, 0.0, 0.0] 3->9 5 gini = 0.0 samples = 9 value = [18, 0, 0, 0] 4->5 6 STG ≤ 0.45 gini = 0.271 samples = 3 value = [4.0, 0.769, 0.0, 0.0] 4->6 7 gini = 0.0 samples = 2 value = [4, 0, 0, 0] 6->7 8 gini = -0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 6->8 10 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 9->10 11 gini = 0.0 samples = 5 value = [0.0, 3.846, 0.0, 0.0] 9->11 13 LPR ≤ 0.665 gini = 0.271 samples = 6 value = [8.0, 1.538, 0.0, 0.0] 12->13 18 LPR ≤ 0.79 gini = 0.294 samples = 35 value = [0.0, 22.308, 4.865, 0.0] 12->18 14 gini = 0.0 samples = 4 value = [8, 0, 0, 0] 13->14 15 STG ≤ 0.23 gini = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 13->15 16 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 15->16 17 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 15->17 19 STR ≤ 0.805 gini = 0.182 samples = 31 value = [0.0, 21.538, 2.432, 0.0] 18->19 30 LPR ≤ 0.96 gini = 0.365 samples = 4 value = [0.0, 0.769, 2.432, 0.0] 18->30 20 LPR ≤ 0.64 gini = 0.134 samples = 29 value = [0.0, 20.769, 1.622, 0.0] 19->20 27 SCG ≤ 0.585 gini = 0.5 samples = 2 value = [0.0, 0.769, 0.811, 0.0] 19->27 21 SCG ≤ 0.055 gini = 0.0 samples = 22 value = [0.0, 16.923, 0.0, 0.0] 20->21 24 LPR ≤ 0.705 gini = 0.417 samples = 7 value = [0.0, 3.846, 1.622, 0.0] 20->24 22 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 21->22 23 gini = 0.0 samples = 21 value = [0.0, 16.154, 0.0, 0.0] 21->23 25 gini = 0.436 samples = 3 value = [0.0, 0.769, 1.622, 0.0] 24->25 26 gini = -0.0 samples = 4 value = [0.0, 3.077, 0.0, 0.0] 24->26 28 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 27->28 29 gini = 0.0 samples = 1 value = [0.0, 0.0, 0.811, 0.0] 27->29 31 gini = 0.0 samples = 3 value = [0.0, 0.0, 2.432, 0.0] 30->31 32 gini = -0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 30->32 34 PEG ≤ 0.415 gini = 0.148 samples = 28 value = [0.0, 0.769, 21.081, 1.034] 33->34 41 LPR ≤ 0.57 gini = 0.447 samples = 7 value = [0.0, 0.0, 4.054, 2.069] 33->41 35 LPR ≤ 0.54 gini = 0.5 samples = 2 value = [0.0, 0.769, 0.811, 0.0] 34->35 38 STG ≤ 0.09 gini = 0.092 samples = 26 value = [0.0, 0.0, 20.27, 1.034] 34->38 36 gini = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 35->36 37 gini = 0.0 samples = 1 value = [0.0, 0.0, 0.811, 0.0] 35->37 39 gini = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 38->39 40 gini = 0.0 samples = 25 value = [0.0, 0.0, 20.27, 0.0] 38->40 42 gini = 0.0 samples = 5 value = [0.0, 0.0, 4.054, 0.0] 41->42 43 gini = -0.0 samples = 2 value = [0.0, 0.0, 0.0, 2.069] 41->43
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 45 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.6899999976158142 else to node 44.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.33500000834465027 else to node 33.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.26500000059604645 else to node 12.
			node=3 is a split node: go to node 4 if X[:, 4] <= 0.26999999582767487 else to node 9.
				node=4 is a split node: go to node 5 if X[:, 4] <= 0.2199999988079071 else to node 6.
					node=5 is a leaf node.
					node=6 is a split node: go to node 7 if X[:, 0] <= 0.44999998807907104 else to node 8.
						node=7 is a leaf node.
						node=8 is a leaf node.
				node=9 is a split node: go to node 10 if X[:, 4] <= 0.29999999701976776 else to node 11.
					node=10 is a leaf node.
					node=11 is a leaf node.
			node=12 is a split node: go to node 13 if X[:, 4] <= 0.12999999895691872 else to node 18.
				node=13 is a split node: go to node 14 if X[:, 3] <= 0.6649999916553497 else to node 15.
					node=14 is a leaf node.
					node=15 is a split node: go to node 16 if X[:, 0] <= 0.23000000417232513 else to node 17.
						node=16 is a leaf node.
						node=17 is a leaf node.
				node=18 is a split node: go to node 19 if X[:, 3] <= 0.7899999916553497 else to node 30.
					node=19 is a split node: go to node 20 if X[:, 2] <= 0.8050000071525574 else to node 27.
						node=20 is a split node: go to node 21 if X[:, 3] <= 0.6399999856948853 else to node 24.
							node=21 is a split node: go to node 22 if X[:, 1] <= 0.054999999701976776 else to node 23.
								node=22 is a leaf node.
								node=23 is a leaf node.
							node=24 is a split node: go to node 25 if X[:, 3] <= 0.7050000131130219 else to node 26.
								node=25 is a leaf node.
								node=26 is a leaf node.
						node=27 is a split node: go to node 28 if X[:, 1] <= 0.5849999934434891 else to node 29.
							node=28 is a leaf node.
							node=29 is a leaf node.
					node=30 is a split node: go to node 31 if X[:, 3] <= 0.9600000083446503 else to node 32.
						node=31 is a leaf node.
						node=32 is a leaf node.
		node=33 is a split node: go to node 34 if X[:, 4] <= 0.6550000011920929 else to node 41.
			node=34 is a split node: go to node 35 if X[:, 4] <= 0.41499999165534973 else to node 38.
				node=35 is a split node: go to node 36 if X[:, 3] <= 0.5399999916553497 else to node 37.
					node=36 is a leaf node.
					node=37 is a leaf node.
				node=38 is a split node: go to node 39 if X[:, 0] <= 0.08999999985098839 else to node 40.
					node=39 is a leaf node.
					node=40 is a leaf node.
			node=41 is a split node: go to node 42 if X[:, 3] <= 0.5699999928474426 else to node 43.
				node=42 is a leaf node.
				node=43 is a leaf node.
	node=44 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.00000000e+00 2.61228947e-18 8.57157482e-18 1.24083750e-17
 1.92439863e-03 1.93316763e-03 2.74027006e-03 4.60139305e-03
 4.65116279e-03 5.66037736e-03 7.12761669e-03 7.54046548e-03
 1.02769950e-02 1.26698152e-02 1.48178638e-02 3.48125409e-02
 6.45587728e-02 1.46994380e-01 1.69846939e-01 2.10651925e-01]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(class_weight='balanced', max_depth=8,
                       max_features='auto', random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9137447405329593

9.1.2 Pre-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Gini Criterion with Balanced Data using SMOTE

In [1202]:
# start Grid search with Gini Criterion
parameters = {'criterion': ['gini'], 
              'splitter': ['best', 'random'], 
              'max_depth':[2,4,6,8,10,12],
              'min_samples_split':[2,3,4],
              'min_samples_leaf':[1,2],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]} #for pruning use [0.0, 0.1, 0.01, 0.001]

des_tree_gini_prepruning = DecisionTreeClassifier()
des_tree_gini_grid_prepruning = GridSearchCV(des_tree_gini_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_gini_grid_results_prepruning = perform_model(des_tree_gini_grid_prepruning, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_gini_grid_results_prepruning)
training the model..
Fitting 3 folds for each of 432 candidates, totalling 1296 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:06.758121


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.007996


---------------------
| Training Set Accuracy |
---------------------

    0.9944444444444445


---------------------
| Testing Set Accuracy  |
---------------------

    0.8916666666666667


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[90  0  0  0]
 [ 0 90  0  0]
 [ 0  2 88  0]
 [ 0  0  0 90]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 36  1  0]
 [ 0  6 30  1]
 [ 0  0  0 29]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.80      0.92      0.86        39
           2       0.97      0.81      0.88        37
           3       0.97      1.00      0.98        29

    accuracy                           0.89       120
   macro avg       0.90      0.88      0.89       120
weighted avg       0.90      0.89      0.89       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.69 gini = 0.725 samples = 120 value = [15, 39, 37, 29] 1 PEG ≤ 0.415 gini = 0.646 samples = 94 value = [15, 39, 37, 3] 0->1 True 34 gini = 0.0 samples = 26 value = [0, 0, 0, 26] 0->34 False 2 LPR ≤ 0.105 gini = 0.518 samples = 61 value = [15, 39, 7, 0] 1->2 31 LPR ≤ 0.835 gini = 0.165 samples = 33 value = [0, 0, 30, 3] 1->31 3 PEG ≤ 0.29 gini = 0.219 samples = 8 value = [7, 1, 0, 0] 2->3 6 PEG ≤ 0.13 gini = 0.446 samples = 53 value = [8, 38, 7, 0] 2->6 4 gini = 0.0 samples = 7 value = [7, 0, 0, 0] 3->4 5 gini = 0.0 samples = 1 value = [0, 1, 0, 0] 3->5 7 STG ≤ 0.41 gini = 0.375 samples = 8 value = [6, 2, 0, 0] 6->7 14 LPR ≤ 0.79 gini = 0.334 samples = 45 value = [2, 36, 7, 0] 6->14 8 PEG ≤ 0.06 gini = 0.48 samples = 5 value = [3, 2, 0, 0] 7->8 13 gini = 0.0 samples = 3 value = [3, 0, 0, 0] 7->13 9 gini = 0.0 samples = 1 value = [0, 1, 0, 0] 8->9 10 PEG ≤ 0.09 gini = 0.375 samples = 4 value = [3, 1, 0, 0] 8->10 11 gini = 0.0 samples = 2 value = [2, 0, 0, 0] 10->11 12 gini = 0.5 samples = 2 value = [1, 1, 0, 0] 10->12 15 STR ≤ 0.575 gini = 0.226 samples = 40 value = [2, 35, 3, 0] 14->15 28 SCG ≤ 0.105 gini = 0.32 samples = 5 value = [0, 1, 4, 0] 14->28 16 LPR ≤ 0.265 gini = 0.135 samples = 28 value = [1, 26, 1, 0] 15->16 23 SCG ≤ 0.125 gini = 0.403 samples = 12 value = [1, 9, 2, 0] 15->23 17 STR ≤ 0.08 gini = 0.32 samples = 5 value = [1, 4, 0, 0] 16->17 20 SCG ≤ 0.18 gini = 0.083 samples = 23 value = [0, 22, 1, 0] 16->20 18 gini = 0.0 samples = 1 value = [1, 0, 0, 0] 17->18 19 gini = 0.0 samples = 4 value = [0, 4, 0, 0] 17->19 21 gini = 0.375 samples = 4 value = [0, 3, 1, 0] 20->21 22 gini = 0.0 samples = 19 value = [0, 19, 0, 0] 20->22 24 gini = 0.0 samples = 1 value = [1, 0, 0, 0] 23->24 25 STR ≤ 0.605 gini = 0.298 samples = 11 value = [0, 9, 2, 0] 23->25 26 gini = 0.0 samples = 1 value = [0, 0, 1, 0] 25->26 27 gini = 0.18 samples = 10 value = [0, 9, 1, 0] 25->27 29 gini = 0.0 samples = 1 value = [0, 1, 0, 0] 28->29 30 gini = 0.0 samples = 4 value = [0, 0, 4, 0] 28->30 32 gini = 0.0 samples = 30 value = [0, 0, 30, 0] 31->32 33 gini = 0.0 samples = 3 value = [0, 0, 0, 3] 31->33
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 35 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.6899999976158142 else to node 34.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.41499999165534973 else to node 31.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.10500000044703484 else to node 6.
			node=3 is a split node: go to node 4 if X[:, 4] <= 0.2900000065565109 else to node 5.
				node=4 is a leaf node.
				node=5 is a leaf node.
			node=6 is a split node: go to node 7 if X[:, 4] <= 0.12999999895691872 else to node 14.
				node=7 is a split node: go to node 8 if X[:, 0] <= 0.4099999964237213 else to node 13.
					node=8 is a split node: go to node 9 if X[:, 4] <= 0.06000000052154064 else to node 10.
						node=9 is a leaf node.
						node=10 is a split node: go to node 11 if X[:, 4] <= 0.08999999985098839 else to node 12.
							node=11 is a leaf node.
							node=12 is a leaf node.
					node=13 is a leaf node.
				node=14 is a split node: go to node 15 if X[:, 3] <= 0.7899999916553497 else to node 28.
					node=15 is a split node: go to node 16 if X[:, 2] <= 0.574999988079071 else to node 23.
						node=16 is a split node: go to node 17 if X[:, 3] <= 0.26500000059604645 else to node 20.
							node=17 is a split node: go to node 18 if X[:, 2] <= 0.0800000000745058 else to node 19.
								node=18 is a leaf node.
								node=19 is a leaf node.
							node=20 is a split node: go to node 21 if X[:, 1] <= 0.17999999597668648 else to node 22.
								node=21 is a leaf node.
								node=22 is a leaf node.
						node=23 is a split node: go to node 24 if X[:, 1] <= 0.1250000037252903 else to node 25.
							node=24 is a leaf node.
							node=25 is a split node: go to node 26 if X[:, 2] <= 0.6049999892711639 else to node 27.
								node=26 is a leaf node.
								node=27 is a leaf node.
					node=28 is a split node: go to node 29 if X[:, 1] <= 0.10499999672174454 else to node 30.
						node=29 is a leaf node.
						node=30 is a leaf node.
		node=31 is a split node: go to node 32 if X[:, 3] <= 0.8350000083446503 else to node 33.
			node=32 is a leaf node.
			node=33 is a leaf node.
	node=34 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.         0.00182648 0.00274024 0.00457875 0.00462963 0.00527066
 0.00540841 0.00709085 0.01256162 0.01446759 0.03508376 0.07395411
 0.13944186 0.16355609 0.21043512]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(max_depth=8, max_features='auto', min_samples_split=3,
                       random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 3, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9305555555555555

9.1.3 Pre-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Entropy Criterion with Original Data

In [1203]:
# start Grid search with Entropy Criterion
parameters = {'criterion': ['entropy'], 
              'splitter': ['best', 'random'], 
              'max_depth':[2,4,6,8,10,12],
              'min_samples_split':[2,3,4],
              'min_samples_leaf':[1,2],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]}
des_tree_entropy_prepruning = DecisionTreeClassifier()
des_tree_entropy_grid_prepruning = GridSearchCV(des_tree_entropy_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results_prepruning = perform_model(des_tree_entropy_grid_prepruning, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_entropy_grid_results_prepruning)
training the model..
Fitting 3 folds for each of 432 candidates, totalling 1296 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:05.407894


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.009995


---------------------
| Training Set Accuracy |
---------------------

    0.9964028776978417


---------------------
| Testing Set Accuracy  |
---------------------

    0.9166666666666666


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[35  0  0  0]
 [ 0 90  0  0]
 [ 0  1 84  0]
 [ 0  0  0 68]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 36  1  0]
 [ 0  3 33  1]
 [ 0  0  0 29]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.86      0.92      0.89        39
           2       0.97      0.89      0.93        37
           3       0.97      1.00      0.98        29

    accuracy                           0.92       120
   macro avg       0.91      0.90      0.91       120
weighted avg       0.92      0.92      0.92       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 entropy = 2.0 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.135 entropy = 1.338 samples = 61 value = [30.0, 30.0, 5.676, 0.0] 0->1 True 28 LPR ≤ 0.135 entropy = 0.992 samples = 59 value = [0.0, 0.0, 24.324, 30.0] 0->28 False 2 LPR ≤ 0.665 entropy = 0.328 samples = 14 value = [24.0, 1.538, 0.0, 0.0] 1->2 7 PEG ≤ 0.255 entropy = 1.161 samples = 47 value = [6.0, 28.462, 5.676, 0.0] 1->7 3 entropy = 0.0 samples = 12 value = [24, 0, 0, 0] 2->3 4 STR ≤ 0.22 entropy = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 2->4 5 entropy = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 4->5 6 entropy = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 4->6 8 STG ≤ 0.35 entropy = 0.927 samples = 18 value = [6.0, 11.538, 0.0, 0.0] 7->8 19 LPR ≤ 0.555 entropy = 0.813 samples = 29 value = [0.0, 16.923, 5.676, 0.0] 7->19 9 PEG ≤ 0.195 entropy = 0.965 samples = 8 value = [6.0, 3.846, 0.0, 0.0] 8->9 16 STG ≤ 0.385 entropy = 0.0 samples = 10 value = [0.0, 7.692, 0.0, 0.0] 8->16 10 entropy = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 9->10 11 PEG ≤ 0.205 entropy = 0.852 samples = 6 value = [6.0, 2.308, 0.0, 0.0] 9->11 12 entropy = 0.0 samples = 1 value = [2, 0, 0, 0] 11->12 13 LPR ≤ 0.475 entropy = 0.947 samples = 5 value = [4.0, 2.308, 0.0, 0.0] 11->13 14 entropy = 0.0 samples = 2 value = [4, 0, 0, 0] 13->14 15 entropy = -0.0 samples = 3 value = [0.0, 2.308, 0.0, 0.0] 13->15 17 entropy = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 16->17 18 entropy = 0.0 samples = 9 value = [0.0, 6.923, 0.0, 0.0] 16->18 20 PEG ≤ 0.27 entropy = 0.0 samples = 21 value = [0.0, 16.154, 0.0, 0.0] 19->20 23 SCG ≤ 0.35 entropy = 0.528 samples = 8 value = [0.0, 0.769, 5.676, 0.0] 19->23 21 entropy = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 20->21 22 entropy = 0.0 samples = 19 value = [0.0, 14.615, 0.0, 0.0] 20->22 24 entropy = 0.0 samples = 5 value = [0.0, 0.0, 4.054, 0.0] 23->24 25 SCG ≤ 0.535 entropy = 0.906 samples = 3 value = [0.0, 0.769, 1.622, 0.0] 23->25 26 entropy = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 25->26 27 entropy = 0.0 samples = 2 value = [0.0, 0.0, 1.622, 0.0] 25->27 29 entropy = 0.0 samples = 7 value = [0.0, 0.0, 5.676, 0.0] 28->29 30 PEG ≤ 0.665 entropy = 0.96 samples = 52 value = [0.0, 0.0, 18.649, 30.0] 28->30 31 LPR ≤ 0.85 entropy = 0.469 samples = 25 value = [0.0, 0.0, 18.649, 2.069] 30->31 36 STR ≤ 0.125 entropy = 0.0 samples = 27 value = [0.0, 0.0, 0.0, 27.931] 30->36 32 entropy = 0.0 samples = 23 value = [0.0, 0.0, 18.649, 0.0] 31->32 33 PEG ≤ 0.61 entropy = 0.0 samples = 2 value = [0.0, 0.0, 0.0, 2.069] 31->33 34 entropy = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 33->34 35 entropy = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 33->35 37 entropy = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 36->37 38 entropy = 0.0 samples = 26 value = [0.0, 0.0, 0.0, 26.897] 36->38
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 39 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 28.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 7.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a split node: go to node 5 if X[:, 2] <= 0.2199999950826168 else to node 6.
				node=5 is a leaf node.
				node=6 is a leaf node.
		node=7 is a split node: go to node 8 if X[:, 4] <= 0.2549999952316284 else to node 19.
			node=8 is a split node: go to node 9 if X[:, 0] <= 0.3499999940395355 else to node 16.
				node=9 is a split node: go to node 10 if X[:, 4] <= 0.19500000029802322 else to node 11.
					node=10 is a leaf node.
					node=11 is a split node: go to node 12 if X[:, 4] <= 0.20499999821186066 else to node 13.
						node=12 is a leaf node.
						node=13 is a split node: go to node 14 if X[:, 3] <= 0.4749999940395355 else to node 15.
							node=14 is a leaf node.
							node=15 is a leaf node.
				node=16 is a split node: go to node 17 if X[:, 0] <= 0.38499999046325684 else to node 18.
					node=17 is a leaf node.
					node=18 is a leaf node.
			node=19 is a split node: go to node 20 if X[:, 3] <= 0.5550000071525574 else to node 23.
				node=20 is a split node: go to node 21 if X[:, 4] <= 0.26999999582767487 else to node 22.
					node=21 is a leaf node.
					node=22 is a leaf node.
				node=23 is a split node: go to node 24 if X[:, 1] <= 0.3499999940395355 else to node 25.
					node=24 is a leaf node.
					node=25 is a split node: go to node 26 if X[:, 1] <= 0.5349999815225601 else to node 27.
						node=26 is a leaf node.
						node=27 is a leaf node.
	node=28 is a split node: go to node 29 if X[:, 3] <= 0.13499999791383743 else to node 30.
		node=29 is a leaf node.
		node=30 is a split node: go to node 31 if X[:, 4] <= 0.6650000214576721 else to node 36.
			node=31 is a split node: go to node 32 if X[:, 3] <= 0.8500000238418579 else to node 33.
				node=32 is a leaf node.
				node=33 is a split node: go to node 34 if X[:, 4] <= 0.6100000143051147 else to node 35.
					node=34 is a leaf node.
					node=35 is a leaf node.
			node=36 is a split node: go to node 37 if X[:, 2] <= 0.12500000186264515 else to node 38.
				node=37 is a leaf node.
				node=38 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.00000000e+00 3.76873706e-17 2.35305905e-16 3.04866340e-16
 5.18649405e-03 5.71558615e-03 6.11343912e-03 7.44464252e-03
 8.48661518e-03 8.63291106e-03 9.18829107e-03 9.82973920e-03
 1.09367460e-02 1.12140546e-02 1.14229162e-02 1.68540384e-02
 2.51848839e-02 2.95475235e-02 3.13389697e-02 6.73639191e-02
 9.76412209e-02 1.82548062e-01 3.69263963e-01 8.12292430e-01]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=8, max_features='auto', random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 8, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9426133707339878

9.1.4 Pre-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Entropy Criterion with Balanced Data using SMOTE

In [1204]:
# start Grid search with Entropy Criterion
parameters = {'criterion': ['entropy'], 
              'splitter': ['best', 'random'], 
              'max_depth':[2,4,6,8,10,12],
              'min_samples_split':[2,3,4],
              'min_samples_leaf':[1,2],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]}
des_tree_entropy_prepruning = DecisionTreeClassifier()
des_tree_entropy_grid_prepruning = GridSearchCV(des_tree_entropy_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results_prepruning = perform_model(des_tree_entropy_grid_prepruning, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_entropy_grid_results_prepruning)
training the model..
Fitting 3 folds for each of 432 candidates, totalling 1296 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:07.005537


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.007997


---------------------
| Training Set Accuracy |
---------------------

    0.9833333333333333


---------------------
| Testing Set Accuracy  |
---------------------

    0.9083333333333333


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[90  0  0  0]
 [ 1 89  0  0]
 [ 0  2 87  1]
 [ 0  0  2 88]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 36  1  0]
 [ 0  3 33  1]
 [ 0  0  1 28]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.86      0.92      0.89        39
           2       0.94      0.89      0.92        37
           3       0.97      0.97      0.97        29

    accuracy                           0.91       120
   macro avg       0.91      0.90      0.90       120
weighted avg       0.91      0.91      0.91       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 entropy = 1.921 samples = 120 value = [15, 39, 37, 29] 1 PEG ≤ 0.135 entropy = 1.269 samples = 61 value = [15, 39, 7, 0] 0->1 True 20 PEG ≤ 0.69 entropy = 1.0 samples = 59 value = [0, 0, 30, 29] 0->20 False 2 LPR ≤ 0.665 entropy = 0.592 samples = 14 value = [12, 2, 0, 0] 1->2 5 PEG ≤ 0.255 entropy = 0.934 samples = 47 value = [3, 37, 7, 0] 1->5 3 entropy = 0.0 samples = 12 value = [12, 0, 0, 0] 2->3 4 entropy = 0.0 samples = 2 value = [0, 2, 0, 0] 2->4 6 STR ≤ 0.305 entropy = 0.65 samples = 18 value = [3, 15, 0, 0] 5->6 15 LPR ≤ 0.555 entropy = 0.797 samples = 29 value = [0, 22, 7, 0] 5->15 7 STG ≤ 0.42 entropy = 0.918 samples = 6 value = [2, 4, 0, 0] 6->7 10 STR ≤ 0.63 entropy = 0.414 samples = 12 value = [1, 11, 0, 0] 6->10 8 entropy = 0.918 samples = 3 value = [2, 1, 0, 0] 7->8 9 entropy = 0.0 samples = 3 value = [0, 3, 0, 0] 7->9 11 entropy = 0.0 samples = 7 value = [0, 7, 0, 0] 10->11 12 PEG ≤ 0.195 entropy = 0.722 samples = 5 value = [1, 4, 0, 0] 10->12 13 entropy = 0.0 samples = 2 value = [0, 2, 0, 0] 12->13 14 entropy = 0.918 samples = 3 value = [1, 2, 0, 0] 12->14 16 entropy = 0.0 samples = 21 value = [0, 21, 0, 0] 15->16 17 STG ≤ 0.185 entropy = 0.544 samples = 8 value = [0, 1, 7, 0] 15->17 18 entropy = 1.0 samples = 2 value = [0, 1, 1, 0] 17->18 19 entropy = 0.0 samples = 6 value = [0, 0, 6, 0] 17->19 21 STR ≤ 0.47 entropy = 0.439 samples = 33 value = [0, 0, 30, 3] 20->21 26 entropy = 0.0 samples = 26 value = [0, 0, 0, 26] 20->26 22 entropy = 0.0 samples = 16 value = [0, 0, 16, 0] 21->22 23 LPR ≤ 0.725 entropy = 0.672 samples = 17 value = [0, 0, 14, 3] 21->23 24 entropy = 0.0 samples = 14 value = [0, 0, 14, 0] 23->24 25 entropy = 0.0 samples = 3 value = [0, 0, 0, 3] 23->25
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 27 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 20.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 5.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a leaf node.
		node=5 is a split node: go to node 6 if X[:, 4] <= 0.2549999952316284 else to node 15.
			node=6 is a split node: go to node 7 if X[:, 2] <= 0.3050000071525574 else to node 10.
				node=7 is a split node: go to node 8 if X[:, 0] <= 0.41999998688697815 else to node 9.
					node=8 is a leaf node.
					node=9 is a leaf node.
				node=10 is a split node: go to node 11 if X[:, 2] <= 0.6299999952316284 else to node 12.
					node=11 is a leaf node.
					node=12 is a split node: go to node 13 if X[:, 4] <= 0.19500000029802322 else to node 14.
						node=13 is a leaf node.
						node=14 is a leaf node.
			node=15 is a split node: go to node 16 if X[:, 3] <= 0.5550000071525574 else to node 17.
				node=16 is a leaf node.
				node=17 is a split node: go to node 18 if X[:, 0] <= 0.1850000061094761 else to node 19.
					node=18 is a leaf node.
					node=19 is a leaf node.
	node=20 is a split node: go to node 21 if X[:, 4] <= 0.6899999976158142 else to node 26.
		node=21 is a split node: go to node 22 if X[:, 2] <= 0.4699999988079071 else to node 23.
			node=22 is a leaf node.
			node=23 is a split node: go to node 24 if X[:, 3] <= 0.7249999940395355 else to node 25.
				node=24 is a leaf node.
				node=25 is a leaf node.
		node=26 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.         0.00318124 0.00345865 0.00492899 0.00564279 0.01053307
 0.01111111 0.01125547 0.01193223 0.01771874 0.02599923 0.02899021
 0.03215664 0.06407498 0.09761783 0.18459135 0.35301946 0.80604746]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(criterion='entropy', max_depth=10, max_features='auto',
                       min_samples_leaf=2, random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9416666666666668

9.1.5 Pre-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Log_Loss Criterion with Original Data

In [1205]:
# start Grid search with Log_Loss Criterion
parameters = {'criterion': ['log_loss'], 
              'splitter': ['best', 'random'], 
              'max_depth':[2,4,6,8,10,12],
              'min_samples_split':[2,3,4],
              'min_samples_leaf':[1,2],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]}
des_tree_logloss_prepruning = DecisionTreeClassifier()
des_tree_logloss_grid_prepruning = GridSearchCV(des_tree_logloss_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results_prepruning = perform_model(des_tree_logloss_grid_prepruning, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_logloss_grid_results_prepruning)
training the model..
Fitting 3 folds for each of 432 candidates, totalling 1296 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:05.210007


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.005997


---------------------
| Training Set Accuracy |
---------------------

    0.9964028776978417


---------------------
| Testing Set Accuracy  |
---------------------

    0.9166666666666666


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[35  0  0  0]
 [ 0 90  0  0]
 [ 0  1 84  0]
 [ 0  0  0 68]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 36  1  0]
 [ 0  3 33  1]
 [ 0  0  0 29]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.86      0.92      0.89        39
           2       0.97      0.89      0.93        37
           3       0.97      1.00      0.98        29

    accuracy                           0.92       120
   macro avg       0.91      0.90      0.91       120
weighted avg       0.92      0.92      0.92       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 log_loss = 2.0 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.135 log_loss = 1.338 samples = 61 value = [30.0, 30.0, 5.676, 0.0] 0->1 True 28 LPR ≤ 0.135 log_loss = 0.992 samples = 59 value = [0.0, 0.0, 24.324, 30.0] 0->28 False 2 LPR ≤ 0.665 log_loss = 0.328 samples = 14 value = [24.0, 1.538, 0.0, 0.0] 1->2 7 PEG ≤ 0.255 log_loss = 1.161 samples = 47 value = [6.0, 28.462, 5.676, 0.0] 1->7 3 log_loss = 0.0 samples = 12 value = [24, 0, 0, 0] 2->3 4 STR ≤ 0.22 log_loss = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 2->4 5 log_loss = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 4->5 6 log_loss = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 4->6 8 STG ≤ 0.35 log_loss = 0.927 samples = 18 value = [6.0, 11.538, 0.0, 0.0] 7->8 19 LPR ≤ 0.555 log_loss = 0.813 samples = 29 value = [0.0, 16.923, 5.676, 0.0] 7->19 9 PEG ≤ 0.195 log_loss = 0.965 samples = 8 value = [6.0, 3.846, 0.0, 0.0] 8->9 16 STG ≤ 0.385 log_loss = 0.0 samples = 10 value = [0.0, 7.692, 0.0, 0.0] 8->16 10 log_loss = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 9->10 11 PEG ≤ 0.205 log_loss = 0.852 samples = 6 value = [6.0, 2.308, 0.0, 0.0] 9->11 12 log_loss = 0.0 samples = 1 value = [2, 0, 0, 0] 11->12 13 LPR ≤ 0.475 log_loss = 0.947 samples = 5 value = [4.0, 2.308, 0.0, 0.0] 11->13 14 log_loss = 0.0 samples = 2 value = [4, 0, 0, 0] 13->14 15 log_loss = -0.0 samples = 3 value = [0.0, 2.308, 0.0, 0.0] 13->15 17 log_loss = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 16->17 18 log_loss = 0.0 samples = 9 value = [0.0, 6.923, 0.0, 0.0] 16->18 20 PEG ≤ 0.27 log_loss = 0.0 samples = 21 value = [0.0, 16.154, 0.0, 0.0] 19->20 23 SCG ≤ 0.35 log_loss = 0.528 samples = 8 value = [0.0, 0.769, 5.676, 0.0] 19->23 21 log_loss = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 20->21 22 log_loss = 0.0 samples = 19 value = [0.0, 14.615, 0.0, 0.0] 20->22 24 log_loss = 0.0 samples = 5 value = [0.0, 0.0, 4.054, 0.0] 23->24 25 SCG ≤ 0.535 log_loss = 0.906 samples = 3 value = [0.0, 0.769, 1.622, 0.0] 23->25 26 log_loss = 0.0 samples = 1 value = [0.0, 0.769, 0.0, 0.0] 25->26 27 log_loss = 0.0 samples = 2 value = [0.0, 0.0, 1.622, 0.0] 25->27 29 log_loss = 0.0 samples = 7 value = [0.0, 0.0, 5.676, 0.0] 28->29 30 PEG ≤ 0.665 log_loss = 0.96 samples = 52 value = [0.0, 0.0, 18.649, 30.0] 28->30 31 LPR ≤ 0.85 log_loss = 0.469 samples = 25 value = [0.0, 0.0, 18.649, 2.069] 30->31 36 STR ≤ 0.125 log_loss = 0.0 samples = 27 value = [0.0, 0.0, 0.0, 27.931] 30->36 32 log_loss = 0.0 samples = 23 value = [0.0, 0.0, 18.649, 0.0] 31->32 33 PEG ≤ 0.61 log_loss = 0.0 samples = 2 value = [0.0, 0.0, 0.0, 2.069] 31->33 34 log_loss = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 33->34 35 log_loss = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 33->35 37 log_loss = 0.0 samples = 1 value = [0.0, 0.0, 0.0, 1.034] 36->37 38 log_loss = 0.0 samples = 26 value = [0.0, 0.0, 0.0, 26.897] 36->38
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 39 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 28.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 7.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a split node: go to node 5 if X[:, 2] <= 0.2199999950826168 else to node 6.
				node=5 is a leaf node.
				node=6 is a leaf node.
		node=7 is a split node: go to node 8 if X[:, 4] <= 0.2549999952316284 else to node 19.
			node=8 is a split node: go to node 9 if X[:, 0] <= 0.3499999940395355 else to node 16.
				node=9 is a split node: go to node 10 if X[:, 4] <= 0.19500000029802322 else to node 11.
					node=10 is a leaf node.
					node=11 is a split node: go to node 12 if X[:, 4] <= 0.20499999821186066 else to node 13.
						node=12 is a leaf node.
						node=13 is a split node: go to node 14 if X[:, 3] <= 0.4749999940395355 else to node 15.
							node=14 is a leaf node.
							node=15 is a leaf node.
				node=16 is a split node: go to node 17 if X[:, 0] <= 0.38499999046325684 else to node 18.
					node=17 is a leaf node.
					node=18 is a leaf node.
			node=19 is a split node: go to node 20 if X[:, 3] <= 0.5550000071525574 else to node 23.
				node=20 is a split node: go to node 21 if X[:, 4] <= 0.26999999582767487 else to node 22.
					node=21 is a leaf node.
					node=22 is a leaf node.
				node=23 is a split node: go to node 24 if X[:, 1] <= 0.3499999940395355 else to node 25.
					node=24 is a leaf node.
					node=25 is a split node: go to node 26 if X[:, 1] <= 0.5349999815225601 else to node 27.
						node=26 is a leaf node.
						node=27 is a leaf node.
	node=28 is a split node: go to node 29 if X[:, 3] <= 0.13499999791383743 else to node 30.
		node=29 is a leaf node.
		node=30 is a split node: go to node 31 if X[:, 4] <= 0.6650000214576721 else to node 36.
			node=31 is a split node: go to node 32 if X[:, 3] <= 0.8500000238418579 else to node 33.
				node=32 is a leaf node.
				node=33 is a split node: go to node 34 if X[:, 4] <= 0.6100000143051147 else to node 35.
					node=34 is a leaf node.
					node=35 is a leaf node.
			node=36 is a split node: go to node 37 if X[:, 2] <= 0.12500000186264515 else to node 38.
				node=37 is a leaf node.
				node=38 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.00000000e+00 3.76873706e-17 2.35305905e-16 3.04866340e-16
 5.18649405e-03 5.71558615e-03 6.11343912e-03 7.44464252e-03
 8.48661518e-03 8.63291106e-03 9.18829107e-03 9.82973920e-03
 1.09367460e-02 1.12140546e-02 1.14229162e-02 1.68540384e-02
 2.51848839e-02 2.95475235e-02 3.13389697e-02 6.73639191e-02
 9.76412209e-02 1.82548062e-01 3.69263963e-01 8.12292430e-01]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(class_weight='balanced', criterion='log_loss',
                       max_depth=8, max_features='auto', random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'log_loss', 'max_depth': 8, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9426133707339878

9.1.6 Pre-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Log_Loss Criterion with Balanced Data using SMOTE

In [1206]:
# start Grid search with Log_Loss Criterion
parameters = {'criterion': ['log_loss'], 
              'splitter': ['best', 'random'], 
              'max_depth':[2,4,6,8,10,12],
              'min_samples_split':[2,3,4],
              'min_samples_leaf':[1,2],
              'min_weight_fraction_leaf':[0.0],
              'max_features':['auto', 'sqrt', 'log2'],
              'random_state':[0],
              'max_leaf_nodes':[None],
              'min_impurity_decrease':[0.0],
              'class_weight':[None,'balanced'],
              'ccp_alpha':[0.0]}
des_tree_logloss_prepruning = DecisionTreeClassifier()
des_tree_logloss_grid_prepruning = GridSearchCV(des_tree_logloss_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results_prepruning = perform_model(des_tree_logloss_grid_prepruning, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_logloss_grid_results_prepruning)
training the model..
Fitting 3 folds for each of 432 candidates, totalling 1296 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:04.892190


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.004997


---------------------
| Training Set Accuracy |
---------------------

    0.9833333333333333


---------------------
| Testing Set Accuracy  |
---------------------

    0.9083333333333333


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[90  0  0  0]
 [ 1 89  0  0]
 [ 0  2 87  1]
 [ 0  0  2 88]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 36  1  0]
 [ 0  3 33  1]
 [ 0  0  1 28]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.86      0.92      0.89        39
           2       0.94      0.89      0.92        37
           3       0.97      0.97      0.97        29

    accuracy                           0.91       120
   macro avg       0.91      0.90      0.90       120
weighted avg       0.91      0.91      0.91       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 log_loss = 1.921 samples = 120 value = [15, 39, 37, 29] 1 PEG ≤ 0.135 log_loss = 1.269 samples = 61 value = [15, 39, 7, 0] 0->1 True 20 PEG ≤ 0.69 log_loss = 1.0 samples = 59 value = [0, 0, 30, 29] 0->20 False 2 LPR ≤ 0.665 log_loss = 0.592 samples = 14 value = [12, 2, 0, 0] 1->2 5 PEG ≤ 0.255 log_loss = 0.934 samples = 47 value = [3, 37, 7, 0] 1->5 3 log_loss = 0.0 samples = 12 value = [12, 0, 0, 0] 2->3 4 log_loss = 0.0 samples = 2 value = [0, 2, 0, 0] 2->4 6 STR ≤ 0.305 log_loss = 0.65 samples = 18 value = [3, 15, 0, 0] 5->6 15 LPR ≤ 0.555 log_loss = 0.797 samples = 29 value = [0, 22, 7, 0] 5->15 7 STG ≤ 0.42 log_loss = 0.918 samples = 6 value = [2, 4, 0, 0] 6->7 10 STR ≤ 0.63 log_loss = 0.414 samples = 12 value = [1, 11, 0, 0] 6->10 8 log_loss = 0.918 samples = 3 value = [2, 1, 0, 0] 7->8 9 log_loss = 0.0 samples = 3 value = [0, 3, 0, 0] 7->9 11 log_loss = 0.0 samples = 7 value = [0, 7, 0, 0] 10->11 12 PEG ≤ 0.195 log_loss = 0.722 samples = 5 value = [1, 4, 0, 0] 10->12 13 log_loss = 0.0 samples = 2 value = [0, 2, 0, 0] 12->13 14 log_loss = 0.918 samples = 3 value = [1, 2, 0, 0] 12->14 16 log_loss = 0.0 samples = 21 value = [0, 21, 0, 0] 15->16 17 STG ≤ 0.185 log_loss = 0.544 samples = 8 value = [0, 1, 7, 0] 15->17 18 log_loss = 1.0 samples = 2 value = [0, 1, 1, 0] 17->18 19 log_loss = 0.0 samples = 6 value = [0, 0, 6, 0] 17->19 21 STR ≤ 0.47 log_loss = 0.439 samples = 33 value = [0, 0, 30, 3] 20->21 26 log_loss = 0.0 samples = 26 value = [0, 0, 0, 26] 20->26 22 log_loss = 0.0 samples = 16 value = [0, 0, 16, 0] 21->22 23 LPR ≤ 0.725 log_loss = 0.672 samples = 17 value = [0, 0, 14, 3] 21->23 24 log_loss = 0.0 samples = 14 value = [0, 0, 14, 0] 23->24 25 log_loss = 0.0 samples = 3 value = [0, 0, 0, 3] 23->25
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 27 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 20.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 5.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a leaf node.
		node=5 is a split node: go to node 6 if X[:, 4] <= 0.2549999952316284 else to node 15.
			node=6 is a split node: go to node 7 if X[:, 2] <= 0.3050000071525574 else to node 10.
				node=7 is a split node: go to node 8 if X[:, 0] <= 0.41999998688697815 else to node 9.
					node=8 is a leaf node.
					node=9 is a leaf node.
				node=10 is a split node: go to node 11 if X[:, 2] <= 0.6299999952316284 else to node 12.
					node=11 is a leaf node.
					node=12 is a split node: go to node 13 if X[:, 4] <= 0.19500000029802322 else to node 14.
						node=13 is a leaf node.
						node=14 is a leaf node.
			node=15 is a split node: go to node 16 if X[:, 3] <= 0.5550000071525574 else to node 17.
				node=16 is a leaf node.
				node=17 is a split node: go to node 18 if X[:, 0] <= 0.1850000061094761 else to node 19.
					node=18 is a leaf node.
					node=19 is a leaf node.
	node=20 is a split node: go to node 21 if X[:, 4] <= 0.6899999976158142 else to node 26.
		node=21 is a split node: go to node 22 if X[:, 2] <= 0.4699999988079071 else to node 23.
			node=22 is a leaf node.
			node=23 is a split node: go to node 24 if X[:, 3] <= 0.7249999940395355 else to node 25.
				node=24 is a leaf node.
				node=25 is a leaf node.
		node=26 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.         0.00318124 0.00345865 0.00492899 0.00564279 0.01053307
 0.01111111 0.01125547 0.01193223 0.01771874 0.02599923 0.02899021
 0.03215664 0.06407498 0.09761783 0.18459135 0.35301946 0.80604746]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(criterion='log_loss', max_depth=10, max_features='auto',
                       min_samples_leaf=2, random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'log_loss', 'max_depth': 10, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9416666666666668

9.2 Compare performance of models after Pre-Pruning

In [1207]:
dFFeature_Subset_Result.drop(dFFeature_Subset_Result.index, inplace=True)
In [1208]:
for item in Feature_Subset_Result:
    dFFeature_Subset_Result.loc[dFFeature_Subset_Result.shape[0]] = item
In [1209]:
print(dFFeature_Subset_Result[['Best Estimator','Training Set Accuracy', 'Testing Set Accuracy', 'Training Time', 'Testing Time',
                                                'Avg. Cross Validation Score of Best Estimator']].to_markdown(tablefmt="pretty"))
+----+----------------------------------------------------------------------------------+-----------------------+----------------------+------------------------+------------------------+-----------------------------------------------+
|    |                                  Best Estimator                                  | Training Set Accuracy | Testing Set Accuracy |     Training Time      |      Testing Time      | Avg. Cross Validation Score of Best Estimator |
+----+----------------------------------------------------------------------------------+-----------------------+----------------------+------------------------+------------------------+-----------------------------------------------+
| 0  |       DecisionTreeClassifier(class_weight='balanced', max_features='auto',       |          1.0          |  0.8833333333333333  | 0 days 00:00:08.519116 | 0 days 00:00:00.006995 |               0.906459404706249               |
|    |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
| 1  |           DecisionTreeClassifier(max_features='auto', random_state=0)            |          1.0          |  0.8916666666666667  | 0 days 00:00:00.327816 | 0 days 00:00:00.006993 |              0.9277777777777777               |
| 2  |                                     chef.fit                                     |        0.7086         |        0.650         | 0 days 00:00:15.182259 | 0 days 00:00:00.094945 |                                               |
| 3  |                                     chef.fit                                     |        0.6055         |        0.6583        | 0 days 00:00:14.784230 | 0 days 00:00:00.056966 |                                               |
| 4  |       DecisionTreeClassifier(class_weight='balanced', criterion='entropy',       |          1.0          |  0.9166666666666666  | 0 days 00:00:00.241866 | 0 days 00:00:00.004998 |              0.9246922237805828               |
|    |                                  max_features='auto', random_state=0)            |                       |                      |                        |                        |                                               |
| 5  | DecisionTreeClassifier(criterion='entropy', max_features='auto', random_state=0) |          1.0          |  0.9083333333333333  | 0 days 00:00:00.290833 | 0 days 00:00:00.008995 |              0.9222222222222222               |
| 6  |      DecisionTreeClassifier(class_weight='balanced', criterion='log_loss',       |          1.0          |  0.9166666666666666  | 0 days 00:00:00.299831 | 0 days 00:00:00.007997 |              0.9246922237805828               |
|    |                                  max_features='auto', random_state=0)            |                       |                      |                        |                        |                                               |
| 7  |        DecisionTreeClassifier(criterion='log_loss', max_features='auto',         |          1.0          |  0.9083333333333333  | 0 days 00:00:00.330811 | 0 days 00:00:00.004999 |              0.9222222222222222               |
|    |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
| 8  |           DecisionTreeClassifier(class_weight='balanced', max_depth=8,           |  0.9928057553956835   |        0.925         | 0 days 00:00:05.274972 | 0 days 00:00:00.004999 |              0.9137447405329593               |
|    |                                  max_features='auto', random_state=0)            |                       |                      |                        |                        |                                               |
| 9  |  DecisionTreeClassifier(max_depth=8, max_features='auto', min_samples_split=3,   |  0.9944444444444445   |  0.8916666666666667  | 0 days 00:00:06.758121 | 0 days 00:00:00.007996 |              0.9305555555555555               |
|    |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
| 10 |       DecisionTreeClassifier(class_weight='balanced', criterion='entropy',       |  0.9964028776978417   |  0.9166666666666666  | 0 days 00:00:05.407894 | 0 days 00:00:00.009995 |              0.9426133707339878               |
|    |                            max_depth=8, max_features='auto', random_state=0)     |                       |                      |                        |                        |                                               |
| 11 |  DecisionTreeClassifier(criterion='entropy', max_depth=10, max_features='auto',  |  0.9833333333333333   |  0.9083333333333333  | 0 days 00:00:07.005537 | 0 days 00:00:00.007997 |              0.9416666666666668               |
|    |                                   min_samples_leaf=2, random_state=0)            |                       |                      |                        |                        |                                               |
| 12 |      DecisionTreeClassifier(class_weight='balanced', criterion='log_loss',       |  0.9964028776978417   |  0.9166666666666666  | 0 days 00:00:05.210007 | 0 days 00:00:00.005997 |              0.9426133707339878               |
|    |                            max_depth=8, max_features='auto', random_state=0)     |                       |                      |                        |                        |                                               |
| 13 | DecisionTreeClassifier(criterion='log_loss', max_depth=10, max_features='auto',  |  0.9833333333333333   |  0.9083333333333333  | 0 days 00:00:04.892190 | 0 days 00:00:00.004997 |              0.9416666666666668               |
|    |                                   min_samples_leaf=2, random_state=0)            |                       |                      |                        |                        |                                               |
+----+----------------------------------------------------------------------------------+-----------------------+----------------------+------------------------+------------------------+-----------------------------------------------+

B) Best scores using Original and Balance Data after pre-pruning

  • Best Scores with Gini Index using Gini @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.9250
  • Best Scores with Information Gain using Log_Loss @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.9166

A) Best scores using Original and Balance Data before any pruning

  • Best Scores with Gini Index using Gini @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.8916
  • Best Scores with Gain Ratio using ChefBoost
    • Testing Set Accuracy: 0.6583
  • Best Scores with Information Gain using Log_Loss @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.9166

Conclusion: There is overfitting in the models have reduced due to reduced training set accuracies and increased testing set accuracies

9.3 Post-Pruning Method using Cost Complexity Pruning

Decision trees can easily overfit. One way to avoid it is to limit the growth of trees by setting constrains during Pre-Pruning. We can limit parameters like max_depth , min_samples etc.

But a most effective way is to use post pruning methods like cost complexity pruning. This helps to improve test accuracy and get a better model.

Cost complexity pruning is all about finding the right parameter for alpha.We will get the alpha values for this tree and will check the accuracy with the pruned trees.

9.3.1 Post-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Gini Criterion with Original Data

In [1210]:
#X_train = dFFeature_Subset_Result.iloc[0]['X_train']
#y_train = dFFeature_Subset_Result.iloc[0]['y_train']
#X_test = dFFeature_Subset_Result.iloc[0]['X_test']
#y_test = dFFeature_Subset_Result.iloc[0]['y_test']
ccp_alphas = dFFeature_Subset_Result.iloc[0]['ccp_alphas']

# For each alpha we will append our model to a list
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, class_weight='balanced', max_features='auto', ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)

#We will remove the last element in clfs and ccp_alphas, because it is the trivial tree with only one node.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()

train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train))
    test_acc.append(accuracy_score(y_test_pred,y_test))

plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()

df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc,  "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy

      ccp_alphas  Train Acc.  Test Acc.  No. of Nodes  Depth  Accuracy Diff.
24  1.698469e-01    0.370504   0.366667             3      1        0.003837
22  3.481254e-02    0.877698   0.866667             9      3        0.011031
23  9.508697e-02    0.647482   0.633333             5      2        0.014149
21  1.481786e-02    0.910072   0.891667            11      3        0.018405
20  1.052027e-02    0.913669   0.883333            13      4        0.030336
19  8.064765e-03    0.920863   0.883333            15      4        0.037530
18  7.540465e-03    0.924460   0.883333            19      4        0.041127
12  4.132231e-03    0.960432   0.891667            35      8        0.068765
16  5.190311e-03    0.935252   0.858333            25      6        0.076918
17  6.138727e-03    0.935252   0.858333            23      6        0.076918
14  4.651163e-03    0.946043   0.866667            29      6        0.079376
15  4.713382e-03    0.938849   0.858333            27      6        0.080516
13  4.601393e-03    0.949640   0.866667            31      6        0.082974
11  3.683072e-03    0.967626   0.883333            39     10        0.084293
10  2.873563e-03    0.974820   0.883333            45     10        0.091487
9   2.869440e-03    0.978417   0.883333            49     10        0.095084
8   2.800000e-03    0.982014   0.883333            53     10        0.098681
7   2.518056e-03    0.985612   0.883333            57     10        0.102278
6   2.427184e-03    0.992806   0.883333            67     10        0.109472
5   2.112676e-03    0.996403   0.883333            71     10        0.113070
4   1.240837e-17    1.000000   0.883333            75     10        0.116667
3   3.700743e-18    1.000000   0.883333            77     10        0.116667
2   2.612289e-18    1.000000   0.883333            79     10        0.116667
1   1.959217e-18    1.000000   0.883333            81     10        0.116667
0   0.000000e+00    1.000000   0.883333            83     11        0.116667
In [1211]:
# start Grid search with Gini Criterion
parameters = {'criterion': ['gini'], 
              'max_features':['auto'],
              'random_state':[0],
              'class_weight':['balanced'],
              'ccp_alpha':[1.481786e-02]}
des_tree_gini_PostPruning = DecisionTreeClassifier()
des_tree_gini_grid_PostPruning = GridSearchCV(des_tree_gini_PostPruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_gini_grid_results_PostPruning = perform_model(des_tree_gini_grid_PostPruning, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_gini_grid_results_PostPruning)
training the model..
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:00.047974


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.006990


---------------------
| Training Set Accuracy |
---------------------

    0.9136690647482014


---------------------
| Testing Set Accuracy  |
---------------------

    0.8833333333333333


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[34  1  0  0]
 [ 1 88  1  0]
 [ 0 15 65  5]
 [ 0  0  1 67]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 37  0  0]
 [ 0  7 28  2]
 [ 0  0  0 29]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.79      0.95      0.86        39
           2       1.00      0.76      0.86        37
           3       0.94      1.00      0.97        29

    accuracy                           0.88       120
   macro avg       0.89      0.88      0.88       120
weighted avg       0.90      0.88      0.88       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.69 gini = 0.75 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.335 gini = 0.687 samples = 94 value = [30.0, 30.0, 30.0, 3.103] 0->1 True 16 gini = -0.0 samples = 26 value = [0.0, 0.0, 0.0, 26.897] 0->16 False 2 LPR ≤ 0.265 gini = 0.567 samples = 59 value = [30.0, 29.231, 4.865, 0.0] 1->2 13 LPR ≤ 0.855 gini = 0.237 samples = 35 value = [0.0, 0.769, 25.135, 3.103] 1->13 3 PEG ≤ 0.27 gini = 0.316 samples = 18 value = [22.0, 5.385, 0.0, 0.0] 2->3 6 PEG ≤ 0.13 gini = 0.513 samples = 41 value = [8.0, 23.846, 4.865, 0.0] 2->6 4 gini = 0.065 samples = 12 value = [22.0, 0.769, 0.0, 0.0] 3->4 5 gini = 0.0 samples = 6 value = [0.0, 4.615, 0.0, 0.0] 3->5 7 LPR ≤ 0.665 gini = 0.271 samples = 6 value = [8.0, 1.538, 0.0, 0.0] 6->7 10 LPR ≤ 0.79 gini = 0.294 samples = 35 value = [0.0, 22.308, 4.865, 0.0] 6->10 8 gini = 0.0 samples = 4 value = [8, 0, 0, 0] 7->8 9 gini = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 7->9 11 gini = 0.182 samples = 31 value = [0.0, 21.538, 2.432, 0.0] 10->11 12 gini = 0.365 samples = 4 value = [0.0, 0.769, 2.432, 0.0] 10->12 14 gini = 0.058 samples = 32 value = [0.0, 0.769, 25.135, 0.0] 13->14 15 gini = 0.0 samples = 3 value = [0.0, 0.0, 0.0, 3.103] 13->15
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 17 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.6899999976158142 else to node 16.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.33500000834465027 else to node 13.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.26500000059604645 else to node 6.
			node=3 is a split node: go to node 4 if X[:, 4] <= 0.26999999582767487 else to node 5.
				node=4 is a leaf node.
				node=5 is a leaf node.
			node=6 is a split node: go to node 7 if X[:, 4] <= 0.12999999895691872 else to node 10.
				node=7 is a split node: go to node 8 if X[:, 3] <= 0.6649999916553497 else to node 9.
					node=8 is a leaf node.
					node=9 is a leaf node.
				node=10 is a split node: go to node 11 if X[:, 3] <= 0.7899999916553497 else to node 12.
					node=11 is a leaf node.
					node=12 is a leaf node.
		node=13 is a split node: go to node 14 if X[:, 3] <= 0.8549999892711639 else to node 15.
			node=14 is a leaf node.
			node=15 is a leaf node.
	node=16 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.00000000e+00 1.95921710e-18 2.61228947e-18 3.70074342e-18
 1.24083750e-17 2.11267606e-03 2.42718447e-03 2.51805559e-03
 2.80000000e-03 2.86944046e-03 2.87356322e-03 3.68307155e-03
 4.13223140e-03 4.60139305e-03 4.65116279e-03 4.71338240e-03
 5.19031142e-03 6.13872706e-03 7.54046548e-03 8.06476542e-03
 1.05202714e-02 1.48178638e-02 3.48125409e-02 9.50869721e-02
 1.69846939e-01 2.10651925e-01]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(ccp_alpha=0.01481786, class_weight='balanced',
                       max_features='auto', random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.01481786, 'class_weight': 'balanced', 'criterion': 'gini', 'max_features': 'auto', 'random_state': 0}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9065373227364812

9.3.2 Post-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Gini Criterion with Balanced Data using SMOTE

In [1212]:
#X_train = dFFeature_Subset_Result.iloc[1]['X_train']
#y_train = dFFeature_Subset_Result.iloc[1]['y_train']
#X_test = dFFeature_Subset_Result.iloc[1]['X_test']
#y_test = dFFeature_Subset_Result.iloc[1]['y_test']
ccp_alphas = dFFeature_Subset_Result.iloc[1]['ccp_alphas']

# For each alpha we will append our model to a list
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, class_weight='balanced', max_features='auto', ccp_alpha=ccp_alpha)
    clf.fit(X_train_smote, y_train_smote)
    clfs.append(clf)

#We will remove the last element in clfs and ccp_alphas, because it is the trivial tree with only one node.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()

train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train_smote)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train_smote))
    test_acc.append(accuracy_score(y_test_pred,y_test))

plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()

df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc,  "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy

    ccp_alphas  Train Acc.  Test Acc.  No. of Nodes  Depth  Accuracy Diff.
19    0.014468    0.894444   0.875000            23      8        0.019444
15    0.007215    0.925000   0.891667            35      8        0.033333
14    0.007170    0.933333   0.900000            37      8        0.033333
13    0.007091    0.941667   0.908333            41      8        0.033333
18    0.010825    0.905556   0.866667            25      8        0.038889
17    0.009428    0.916667   0.866667            29      8        0.050000
16    0.008660    0.922222   0.866667            31      8        0.055556
22    0.033381    0.722222   0.658333             7      3        0.063889
12    0.006667    0.950000   0.883333            45      8        0.066667
7     0.004167    0.977778   0.908333            59      9        0.069444
20    0.014498    0.869444   0.800000            17      6        0.069444
6     0.003968    0.980556   0.908333            61     10        0.072222
23    0.035084    0.705556   0.633333             5      2        0.072222
11    0.004938    0.955556   0.883333            47      8        0.072222
10    0.004630    0.958333   0.883333            49      9        0.075000
9     0.004583    0.961111   0.883333            51      9        0.077778
21    0.021054    0.811111   0.725000            11      3        0.086111
5     0.003704    0.986111   0.900000            65     10        0.086111
8     0.004579    0.969444   0.883333            55      9        0.086111
4     0.003307    0.988889   0.900000            67     10        0.088889
3     0.002679    0.991667   0.900000            69     10        0.091667
2     0.002623    0.994444   0.900000            73     10        0.094444
1     0.002315    0.997222   0.891667            77     10        0.105556
0     0.000000    1.000000   0.891667            81     10        0.108333
24    0.163556    0.500000   0.366667             3      1        0.133333
In [1213]:
# start Grid search with Gini Criterion
parameters = {'criterion': ['gini'], 
              'max_features':['auto'],
              'random_state':[0],
              'class_weight':['balanced'],
              'ccp_alpha':[0.014468]}
des_tree_gini_PostPruning = DecisionTreeClassifier()
des_tree_gini_grid_PostPruning = GridSearchCV(des_tree_gini_PostPruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_gini_grid_results_PostPruning = perform_model(des_tree_gini_grid_PostPruning, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_gini_grid_results_PostPruning)
training the model..
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:00.053970


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.005996


---------------------
| Training Set Accuracy |
---------------------

    0.8944444444444445


---------------------
| Testing Set Accuracy  |
---------------------

    0.875


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[81  9  0  0]
 [ 0 89  0  1]
 [ 0 16 62 12]
 [ 0  0  0 90]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 1 38  0  0]
 [ 0  7 27  3]
 [ 0  0  1 28]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.92      0.80      0.86        15
           1       0.79      0.97      0.87        39
           2       0.96      0.73      0.83        37
           3       0.90      0.97      0.93        29

    accuracy                           0.88       120
   macro avg       0.90      0.87      0.87       120
weighted avg       0.89      0.88      0.87       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.69 gini = 0.75 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.335 gini = 0.687 samples = 94 value = [30.0, 30.0, 30.0, 3.103] 0->1 True 16 gini = -0.0 samples = 26 value = [0.0, 0.0, 0.0, 26.897] 0->16 False 2 LPR ≤ 0.265 gini = 0.567 samples = 59 value = [30.0, 29.231, 4.865, 0.0] 1->2 13 LPR ≤ 0.855 gini = 0.237 samples = 35 value = [0.0, 0.769, 25.135, 3.103] 1->13 3 PEG ≤ 0.27 gini = 0.316 samples = 18 value = [22.0, 5.385, 0.0, 0.0] 2->3 6 PEG ≤ 0.13 gini = 0.513 samples = 41 value = [8.0, 23.846, 4.865, 0.0] 2->6 4 gini = 0.065 samples = 12 value = [22.0, 0.769, 0.0, 0.0] 3->4 5 gini = 0.0 samples = 6 value = [0.0, 4.615, 0.0, 0.0] 3->5 7 LPR ≤ 0.665 gini = 0.271 samples = 6 value = [8.0, 1.538, 0.0, 0.0] 6->7 10 LPR ≤ 0.79 gini = 0.294 samples = 35 value = [0.0, 22.308, 4.865, 0.0] 6->10 8 gini = 0.0 samples = 4 value = [8, 0, 0, 0] 7->8 9 gini = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 7->9 11 gini = 0.182 samples = 31 value = [0.0, 21.538, 2.432, 0.0] 10->11 12 gini = 0.365 samples = 4 value = [0.0, 0.769, 2.432, 0.0] 10->12 14 gini = 0.058 samples = 32 value = [0.0, 0.769, 25.135, 0.0] 13->14 15 gini = 0.0 samples = 3 value = [0.0, 0.0, 0.0, 3.103] 13->15
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 17 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.6899999976158142 else to node 16.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.33500000834465027 else to node 13.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.26500000059604645 else to node 6.
			node=3 is a split node: go to node 4 if X[:, 4] <= 0.26999999582767487 else to node 5.
				node=4 is a leaf node.
				node=5 is a leaf node.
			node=6 is a split node: go to node 7 if X[:, 4] <= 0.12999999895691872 else to node 10.
				node=7 is a split node: go to node 8 if X[:, 3] <= 0.6649999916553497 else to node 9.
					node=8 is a leaf node.
					node=9 is a leaf node.
				node=10 is a split node: go to node 11 if X[:, 3] <= 0.7899999916553497 else to node 12.
					node=11 is a leaf node.
					node=12 is a leaf node.
		node=13 is a split node: go to node 14 if X[:, 3] <= 0.8549999892711639 else to node 15.
			node=14 is a leaf node.
			node=15 is a leaf node.
	node=16 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.         0.00231481 0.00262346 0.00267857 0.00330688 0.0037037
 0.00396825 0.00416667 0.00457875 0.00458333 0.00462963 0.00493827
 0.00666667 0.00709085 0.00716963 0.00721501 0.00865973 0.00942761
 0.01082506 0.01446759 0.0144977  0.02105404 0.03338142 0.03508376
 0.16355609 0.21043512]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(ccp_alpha=0.014468, class_weight='balanced',
                       max_features='auto', random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.014468, 'class_weight': 'balanced', 'criterion': 'gini', 'max_features': 'auto', 'random_state': 0}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.8972222222222221

9.3.3 Post-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Entropy Criterion with Original Data

In [1214]:
#X_train = dFFeature_Subset_Result.iloc[4]['X_train']
#y_train = dFFeature_Subset_Result.iloc[4]['y_train']
#X_test = dFFeature_Subset_Result.iloc[4]['X_test']
#y_test = dFFeature_Subset_Result.iloc[4]['y_test']
ccp_alphas = dFFeature_Subset_Result.iloc[4]['ccp_alphas']

# For each alpha we will append our model to a list
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion='entropy', random_state=0, class_weight='balanced', max_features='auto', ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)

#We will remove the last element in clfs and ccp_alphas, because it is the trivial tree with only one node.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()

train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train))
    test_acc.append(accuracy_score(y_test_pred,y_test))

plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()

df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc,  "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy

      ccp_alphas  Train Acc.  Test Acc.  No. of Nodes  Depth  Accuracy Diff.
18  1.390813e-01    0.848921   0.850000             7      2       -0.001079
20  3.692640e-01    0.568345   0.566667             3      1        0.001679
17  9.764122e-02    0.888489   0.875000             9      3        0.013489
19  2.581308e-01    0.651079   0.633333             5      2        0.017746
14  2.954752e-02    0.960432   0.941667            17      5        0.018765
16  6.736392e-02    0.920863   0.900000            11      3        0.020863
15  3.133897e-02    0.928058   0.900000            15      4        0.028058
13  1.685404e-02    0.964029   0.933333            19      5        0.030695
10  1.093675e-02    0.978417   0.925000            27      7        0.053417
12  1.323572e-02    0.964029   0.908333            21      6        0.055695
9   8.632911e-03    0.982014   0.925000            29      7        0.057014
8   8.486615e-03    0.982014   0.925000            31      7        0.057014
11  1.121405e-02    0.967626   0.908333            25      6        0.059293
7   7.433563e-03    0.985612   0.925000            33      7        0.060612
6   6.113439e-03    0.989209   0.925000            39      7        0.064209
5   5.715586e-03    0.992806   0.925000            41      8        0.067806
4   5.625219e-03    0.996403   0.916667            43      8        0.079736
3   3.048663e-16    1.000000   0.916667            47     10        0.083333
2   2.353059e-16    1.000000   0.916667            49     10        0.083333
1   1.943255e-17    1.000000   0.916667            51     10        0.083333
0   0.000000e+00    1.000000   0.916667            53     10        0.083333
In [1215]:
# start Grid search with Entropy Criterion
parameters = {'criterion': ['entropy'], 
              'max_features':['auto'],
              'random_state':[0],
              'class_weight':['balanced'],
              'ccp_alpha':[2.954752e-02]}
des_tree_entropy_PostPruning = DecisionTreeClassifier()
des_tree_entropy_grid_PostPruning = GridSearchCV(des_tree_entropy_PostPruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results_PostPruning = perform_model(des_tree_entropy_grid_PostPruning, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_entropy_grid_results_PostPruning)
training the model..
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:00.049971


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.007994


---------------------
| Training Set Accuracy |
---------------------

    0.9640287769784173


---------------------
| Testing Set Accuracy  |
---------------------

    0.9333333333333333


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[35  0  0  0]
 [ 2 87  1  0]
 [ 0  5 79  1]
 [ 0  0  1 67]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 37  0  0]
 [ 0  2 34  1]
 [ 0  0  0 29]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.88      0.95      0.91        39
           2       1.00      0.92      0.96        37
           3       0.97      1.00      0.98        29

    accuracy                           0.93       120
   macro avg       0.93      0.92      0.92       120
weighted avg       0.94      0.93      0.93       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 entropy = 2.0 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.135 entropy = 1.338 samples = 61 value = [30.0, 30.0, 5.676, 0.0] 0->1 True 12 LPR ≤ 0.135 entropy = 0.992 samples = 59 value = [0.0, 0.0, 24.324, 30.0] 0->12 False 2 LPR ≤ 0.665 entropy = 0.328 samples = 14 value = [24.0, 1.538, 0.0, 0.0] 1->2 5 PEG ≤ 0.255 entropy = 1.161 samples = 47 value = [6.0, 28.462, 5.676, 0.0] 1->5 3 entropy = 0.0 samples = 12 value = [24, 0, 0, 0] 2->3 4 entropy = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 2->4 6 STG ≤ 0.35 entropy = 0.927 samples = 18 value = [6.0, 11.538, 0.0, 0.0] 5->6 9 LPR ≤ 0.555 entropy = 0.813 samples = 29 value = [0.0, 16.923, 5.676, 0.0] 5->9 7 entropy = 0.965 samples = 8 value = [6.0, 3.846, 0.0, 0.0] 6->7 8 entropy = 0.0 samples = 10 value = [0.0, 7.692, 0.0, 0.0] 6->8 10 entropy = 0.0 samples = 21 value = [0.0, 16.154, 0.0, 0.0] 9->10 11 entropy = 0.528 samples = 8 value = [0.0, 0.769, 5.676, 0.0] 9->11 13 entropy = 0.0 samples = 7 value = [0.0, 0.0, 5.676, 0.0] 12->13 14 PEG ≤ 0.665 entropy = 0.96 samples = 52 value = [0.0, 0.0, 18.649, 30.0] 12->14 15 LPR ≤ 0.85 entropy = 0.469 samples = 25 value = [0.0, 0.0, 18.649, 2.069] 14->15 18 entropy = 0.0 samples = 27 value = [0.0, 0.0, 0.0, 27.931] 14->18 16 entropy = 0.0 samples = 23 value = [0.0, 0.0, 18.649, 0.0] 15->16 17 entropy = 0.0 samples = 2 value = [0.0, 0.0, 0.0, 2.069] 15->17
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 19 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 12.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 5.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a leaf node.
		node=5 is a split node: go to node 6 if X[:, 4] <= 0.2549999952316284 else to node 9.
			node=6 is a split node: go to node 7 if X[:, 0] <= 0.3499999940395355 else to node 8.
				node=7 is a leaf node.
				node=8 is a leaf node.
			node=9 is a split node: go to node 10 if X[:, 3] <= 0.5550000071525574 else to node 11.
				node=10 is a leaf node.
				node=11 is a leaf node.
	node=12 is a split node: go to node 13 if X[:, 3] <= 0.13499999791383743 else to node 14.
		node=13 is a leaf node.
		node=14 is a split node: go to node 15 if X[:, 4] <= 0.6650000214576721 else to node 18.
			node=15 is a split node: go to node 16 if X[:, 3] <= 0.8500000238418579 else to node 17.
				node=16 is a leaf node.
				node=17 is a leaf node.
			node=18 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.00000000e+00 1.94325505e-17 2.35305905e-16 3.04866340e-16
 5.62521852e-03 5.71558615e-03 6.11343912e-03 7.43356304e-03
 8.48661518e-03 8.63291106e-03 1.09367460e-02 1.12140546e-02
 1.32357191e-02 1.68540384e-02 2.95475235e-02 3.13389697e-02
 6.73639191e-02 9.76412209e-02 1.39081296e-01 2.58130803e-01
 3.69263963e-01 8.12292430e-01]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(ccp_alpha=0.02954752, class_weight='balanced',
                       criterion='entropy', max_features='auto',
                       random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.02954752, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto', 'random_state': 0}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9066152407667133

9.3.4 Post-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Entropy Criterion with Balanced Data using SMOTE

In [1216]:
#X_train = dFFeature_Subset_Result.iloc[5]['X_train']
#y_train = dFFeature_Subset_Result.iloc[5]['y_train']
#X_test = dFFeature_Subset_Result.iloc[5]['X_test']
#y_test = dFFeature_Subset_Result.iloc[5]['y_test']
ccp_alphas = dFFeature_Subset_Result.iloc[5]['ccp_alphas']

# For each alpha we will append our model to a list
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion='entropy', random_state=0, class_weight='balanced', max_features='auto', ccp_alpha=ccp_alpha)
    clf.fit(X_train_smote, y_train_smote)
    clfs.append(clf)

#We will remove the last element in clfs and ccp_alphas, because it is the trivial tree with only one node.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()

train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train_smote)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train_smote))
    test_acc.append(accuracy_score(y_test_pred,y_test))

plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()

df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc,  "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy

    ccp_alphas  Train Acc.  Test Acc.  No. of Nodes  Depth  Accuracy Diff.
13    0.032157    0.922222   0.916667            15      4        0.005556
12    0.028990    0.933333   0.916667            17      4        0.016667
14    0.064075    0.911111   0.891667            11      3        0.019444
15    0.097618    0.894444   0.866667             9      3        0.027778
11    0.025999    0.944444   0.908333            19      4        0.036111
10    0.020946    0.947222   0.908333            21      4        0.038889
9     0.017719    0.950000   0.908333            23      4        0.041667
6     0.011111    0.977778   0.916667            31      6        0.061111
8     0.012581    0.975000   0.908333            27      6        0.066667
7     0.011255    0.977778   0.908333            29      6        0.069444
5     0.010834    0.983333   0.908333            33      7        0.075000
16    0.184591    0.708333   0.633333             5      2        0.075000
4     0.010533    0.986111   0.908333            35      7        0.077778
3     0.009014    0.991667   0.908333            39      8        0.083333
2     0.007652    0.994444   0.908333            41      9        0.086111
1     0.006313    0.997222   0.908333            43      9        0.088889
0     0.000000    1.000000   0.908333            49      9        0.091667
17    0.353019    0.500000   0.366667             3      1        0.133333
In [1217]:
# start Grid search with Entropy Criterion
parameters = {'criterion': ['entropy'], 
              'max_features':['auto'],
              'random_state':[0],
              'class_weight':['balanced'],
              'ccp_alpha':[0.032157]}
des_tree_entropy_PostPruning = DecisionTreeClassifier()
des_tree_entropy_grid_PostPruning = GridSearchCV(des_tree_entropy_PostPruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results_PostPruning = perform_model(des_tree_entropy_grid_PostPruning, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_entropy_grid_results_PostPruning)
training the model..
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:00.052968


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.006996


---------------------
| Training Set Accuracy |
---------------------

    0.9222222222222223


---------------------
| Testing Set Accuracy  |
---------------------

    0.9166666666666666


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[81  9  0  0]
 [ 0 76 14  0]
 [ 0  1 86  3]
 [ 0  0  1 89]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 1 36  2  0]
 [ 0  2 33  2]
 [ 0  0  0 29]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.92      0.80      0.86        15
           1       0.88      0.92      0.90        39
           2       0.94      0.89      0.92        37
           3       0.94      1.00      0.97        29

    accuracy                           0.92       120
   macro avg       0.92      0.90      0.91       120
weighted avg       0.92      0.92      0.92       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 entropy = 2.0 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.135 entropy = 1.338 samples = 61 value = [30.0, 30.0, 5.676, 0.0] 0->1 True 12 LPR ≤ 0.135 entropy = 0.992 samples = 59 value = [0.0, 0.0, 24.324, 30.0] 0->12 False 2 LPR ≤ 0.665 entropy = 0.328 samples = 14 value = [24.0, 1.538, 0.0, 0.0] 1->2 5 PEG ≤ 0.255 entropy = 1.161 samples = 47 value = [6.0, 28.462, 5.676, 0.0] 1->5 3 entropy = 0.0 samples = 12 value = [24, 0, 0, 0] 2->3 4 entropy = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 2->4 6 STG ≤ 0.35 entropy = 0.927 samples = 18 value = [6.0, 11.538, 0.0, 0.0] 5->6 9 LPR ≤ 0.555 entropy = 0.813 samples = 29 value = [0.0, 16.923, 5.676, 0.0] 5->9 7 entropy = 0.965 samples = 8 value = [6.0, 3.846, 0.0, 0.0] 6->7 8 entropy = 0.0 samples = 10 value = [0.0, 7.692, 0.0, 0.0] 6->8 10 entropy = 0.0 samples = 21 value = [0.0, 16.154, 0.0, 0.0] 9->10 11 entropy = 0.528 samples = 8 value = [0.0, 0.769, 5.676, 0.0] 9->11 13 entropy = 0.0 samples = 7 value = [0.0, 0.0, 5.676, 0.0] 12->13 14 PEG ≤ 0.665 entropy = 0.96 samples = 52 value = [0.0, 0.0, 18.649, 30.0] 12->14 15 LPR ≤ 0.85 entropy = 0.469 samples = 25 value = [0.0, 0.0, 18.649, 2.069] 14->15 18 entropy = 0.0 samples = 27 value = [0.0, 0.0, 0.0, 27.931] 14->18 16 entropy = 0.0 samples = 23 value = [0.0, 0.0, 18.649, 0.0] 15->16 17 entropy = 0.0 samples = 2 value = [0.0, 0.0, 0.0, 2.069] 15->17
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 19 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 12.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 5.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a leaf node.
		node=5 is a split node: go to node 6 if X[:, 4] <= 0.2549999952316284 else to node 9.
			node=6 is a split node: go to node 7 if X[:, 0] <= 0.3499999940395355 else to node 8.
				node=7 is a leaf node.
				node=8 is a leaf node.
			node=9 is a split node: go to node 10 if X[:, 3] <= 0.5550000071525574 else to node 11.
				node=10 is a leaf node.
				node=11 is a leaf node.
	node=12 is a split node: go to node 13 if X[:, 3] <= 0.13499999791383743 else to node 14.
		node=13 is a leaf node.
		node=14 is a split node: go to node 15 if X[:, 4] <= 0.6650000214576721 else to node 18.
			node=15 is a split node: go to node 16 if X[:, 3] <= 0.8500000238418579 else to node 17.
				node=16 is a leaf node.
				node=17 is a leaf node.
			node=18 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.         0.00631268 0.00765247 0.0090142  0.01053307 0.01083371
 0.01111111 0.01125547 0.01258146 0.01771874 0.02094643 0.02599923
 0.02899021 0.03215664 0.06407498 0.09761783 0.18459135 0.35301946
 0.80604746]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(ccp_alpha=0.032157, class_weight='balanced',
                       criterion='entropy', max_features='auto',
                       random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.032157, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto', 'random_state': 0}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9055555555555556

9.3.5 Post-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Log_Loss Criterion with Original Data

In [1218]:
#X_train = dFFeature_Subset_Result.iloc[6]['X_train']
#y_train = dFFeature_Subset_Result.iloc[6]['y_train']
#X_test = dFFeature_Subset_Result.iloc[6]['X_test']
#y_test = dFFeature_Subset_Result.iloc[6]['y_test']
ccp_alphas = dFFeature_Subset_Result.iloc[6]['ccp_alphas']

# For each alpha we will append our model to a list
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion='log_loss', random_state=0, class_weight='balanced', max_features='auto', ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)

#We will remove the last element in clfs and ccp_alphas, because it is the trivial tree with only one node.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()

train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train))
    test_acc.append(accuracy_score(y_test_pred,y_test))

plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()

df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc,  "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy

      ccp_alphas  Train Acc.  Test Acc.  No. of Nodes  Depth  Accuracy Diff.
18  1.390813e-01    0.848921   0.850000             7      2       -0.001079
20  3.692640e-01    0.568345   0.566667             3      1        0.001679
17  9.764122e-02    0.888489   0.875000             9      3        0.013489
19  2.581308e-01    0.651079   0.633333             5      2        0.017746
14  2.954752e-02    0.960432   0.941667            17      5        0.018765
16  6.736392e-02    0.920863   0.900000            11      3        0.020863
15  3.133897e-02    0.928058   0.900000            15      4        0.028058
13  1.685404e-02    0.964029   0.933333            19      5        0.030695
10  1.093675e-02    0.978417   0.925000            27      7        0.053417
12  1.323572e-02    0.964029   0.908333            21      6        0.055695
9   8.632911e-03    0.982014   0.925000            29      7        0.057014
8   8.486615e-03    0.982014   0.925000            31      7        0.057014
11  1.121405e-02    0.967626   0.908333            25      6        0.059293
7   7.433563e-03    0.985612   0.925000            33      7        0.060612
6   6.113439e-03    0.989209   0.925000            39      7        0.064209
5   5.715586e-03    0.992806   0.925000            41      8        0.067806
4   5.625219e-03    0.996403   0.916667            43      8        0.079736
3   3.048663e-16    1.000000   0.916667            47     10        0.083333
2   2.353059e-16    1.000000   0.916667            49     10        0.083333
1   1.943255e-17    1.000000   0.916667            51     10        0.083333
0   0.000000e+00    1.000000   0.916667            53     10        0.083333
In [1219]:
# start Grid search with Log_Loss Criterion
parameters = {'criterion': ['log_loss'], 
              'max_features':['auto'],
              'random_state':[0],
              'class_weight':['balanced'],
              'ccp_alpha':[2.954752e-02]}
des_tree_logloss = DecisionTreeClassifier()
des_tree_logloss_grid = GridSearchCV(des_tree_logloss, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results = perform_model(des_tree_logloss_grid, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_logloss_grid_results)
training the model..
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:00.051992


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.005996


---------------------
| Training Set Accuracy |
---------------------

    0.9640287769784173


---------------------
| Testing Set Accuracy  |
---------------------

    0.9333333333333333


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[35  0  0  0]
 [ 2 87  1  0]
 [ 0  5 79  1]
 [ 0  0  1 67]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 2 37  0  0]
 [ 0  2 34  1]
 [ 0  0  0 29]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.88      0.95      0.91        39
           2       1.00      0.92      0.96        37
           3       0.97      1.00      0.98        29

    accuracy                           0.93       120
   macro avg       0.93      0.92      0.92       120
weighted avg       0.94      0.93      0.93       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 log_loss = 2.0 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.135 log_loss = 1.338 samples = 61 value = [30.0, 30.0, 5.676, 0.0] 0->1 True 12 LPR ≤ 0.135 log_loss = 0.992 samples = 59 value = [0.0, 0.0, 24.324, 30.0] 0->12 False 2 LPR ≤ 0.665 log_loss = 0.328 samples = 14 value = [24.0, 1.538, 0.0, 0.0] 1->2 5 PEG ≤ 0.255 log_loss = 1.161 samples = 47 value = [6.0, 28.462, 5.676, 0.0] 1->5 3 log_loss = 0.0 samples = 12 value = [24, 0, 0, 0] 2->3 4 log_loss = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 2->4 6 STG ≤ 0.35 log_loss = 0.927 samples = 18 value = [6.0, 11.538, 0.0, 0.0] 5->6 9 LPR ≤ 0.555 log_loss = 0.813 samples = 29 value = [0.0, 16.923, 5.676, 0.0] 5->9 7 log_loss = 0.965 samples = 8 value = [6.0, 3.846, 0.0, 0.0] 6->7 8 log_loss = 0.0 samples = 10 value = [0.0, 7.692, 0.0, 0.0] 6->8 10 log_loss = 0.0 samples = 21 value = [0.0, 16.154, 0.0, 0.0] 9->10 11 log_loss = 0.528 samples = 8 value = [0.0, 0.769, 5.676, 0.0] 9->11 13 log_loss = 0.0 samples = 7 value = [0.0, 0.0, 5.676, 0.0] 12->13 14 PEG ≤ 0.665 log_loss = 0.96 samples = 52 value = [0.0, 0.0, 18.649, 30.0] 12->14 15 LPR ≤ 0.85 log_loss = 0.469 samples = 25 value = [0.0, 0.0, 18.649, 2.069] 14->15 18 log_loss = 0.0 samples = 27 value = [0.0, 0.0, 0.0, 27.931] 14->18 16 log_loss = 0.0 samples = 23 value = [0.0, 0.0, 18.649, 0.0] 15->16 17 log_loss = 0.0 samples = 2 value = [0.0, 0.0, 0.0, 2.069] 15->17
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 19 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 12.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 5.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a leaf node.
		node=5 is a split node: go to node 6 if X[:, 4] <= 0.2549999952316284 else to node 9.
			node=6 is a split node: go to node 7 if X[:, 0] <= 0.3499999940395355 else to node 8.
				node=7 is a leaf node.
				node=8 is a leaf node.
			node=9 is a split node: go to node 10 if X[:, 3] <= 0.5550000071525574 else to node 11.
				node=10 is a leaf node.
				node=11 is a leaf node.
	node=12 is a split node: go to node 13 if X[:, 3] <= 0.13499999791383743 else to node 14.
		node=13 is a leaf node.
		node=14 is a split node: go to node 15 if X[:, 4] <= 0.6650000214576721 else to node 18.
			node=15 is a split node: go to node 16 if X[:, 3] <= 0.8500000238418579 else to node 17.
				node=16 is a leaf node.
				node=17 is a leaf node.
			node=18 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.00000000e+00 1.94325505e-17 2.35305905e-16 3.04866340e-16
 5.62521852e-03 5.71558615e-03 6.11343912e-03 7.43356304e-03
 8.48661518e-03 8.63291106e-03 1.09367460e-02 1.12140546e-02
 1.32357191e-02 1.68540384e-02 2.95475235e-02 3.13389697e-02
 6.73639191e-02 9.76412209e-02 1.39081296e-01 2.58130803e-01
 3.69263963e-01 8.12292430e-01]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(ccp_alpha=0.02954752, class_weight='balanced',
                       criterion='log_loss', max_features='auto',
                       random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.02954752, 'class_weight': 'balanced', 'criterion': 'log_loss', 'max_features': 'auto', 'random_state': 0}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9066152407667133

9.3.6 Post-Pruning Method: Calculate the best Decision Tree Model using GridSearchCV using Log_Loss Criterion with Balanced Data using SMOTE

In [1220]:
#X_train = dFFeature_Subset_Result.iloc[7]['X_train']
#y_train = dFFeature_Subset_Result.iloc[7]['y_train']
#X_test = dFFeature_Subset_Result.iloc[7]['X_test']
#y_test = dFFeature_Subset_Result.iloc[7]['y_test']
ccp_alphas = dFFeature_Subset_Result.iloc[7]['ccp_alphas']

# For each alpha we will append our model to a list
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion='log_loss', random_state=0, class_weight='balanced', max_features='auto', ccp_alpha=ccp_alpha)
    clf.fit(X_train_smote, y_train_smote)
    clfs.append(clf)

#We will remove the last element in clfs and ccp_alphas, because it is the trivial tree with only one node.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()

train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train_smote)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train_smote))
    test_acc.append(accuracy_score(y_test_pred,y_test))

plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()

df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc,  "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy

    ccp_alphas  Train Acc.  Test Acc.  No. of Nodes  Depth  Accuracy Diff.
13    0.032157    0.922222   0.916667            15      4        0.005556
12    0.028990    0.933333   0.916667            17      4        0.016667
14    0.064075    0.911111   0.891667            11      3        0.019444
15    0.097618    0.894444   0.866667             9      3        0.027778
11    0.025999    0.944444   0.908333            19      4        0.036111
10    0.020946    0.947222   0.908333            21      4        0.038889
9     0.017719    0.950000   0.908333            23      4        0.041667
6     0.011111    0.977778   0.916667            31      6        0.061111
8     0.012581    0.975000   0.908333            27      6        0.066667
7     0.011255    0.977778   0.908333            29      6        0.069444
5     0.010834    0.983333   0.908333            33      7        0.075000
16    0.184591    0.708333   0.633333             5      2        0.075000
4     0.010533    0.986111   0.908333            35      7        0.077778
3     0.009014    0.991667   0.908333            39      8        0.083333
2     0.007652    0.994444   0.908333            41      9        0.086111
1     0.006313    0.997222   0.908333            43      9        0.088889
0     0.000000    1.000000   0.908333            49      9        0.091667
17    0.353019    0.500000   0.366667             3      1        0.133333
In [1221]:
# start Grid search with Log_Loss Criterion
parameters = {'criterion': ['log_loss'], 
              'max_features':['auto'],
              'random_state':[0],
              'class_weight':['balanced'],
              'ccp_alpha':[0.032157]}
des_tree_logloss = DecisionTreeClassifier()
des_tree_logloss_grid = GridSearchCV(des_tree_logloss, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results = perform_model(des_tree_logloss_grid, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model 
print_grid_search_attributes(des_tree_logloss_grid_results)
training the model..
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Done 
 

---------------------
|   Training Time    |
---------------------
training_time(HH:MM:SS.ms) - 0:00:00.053968


Predicting test data
Done 
 

---------------------
|   Testing Time    |
---------------------
testing time(HH:MM:SS:ms) - 0:00:00.005996


---------------------
| Training Set Accuracy |
---------------------

    0.9222222222222223


---------------------
| Testing Set Accuracy  |
---------------------

    0.9166666666666666


-----------------------------
| Train Set Confusion Matrix |
-----------------------------

 [[81  9  0  0]
 [ 0 76 14  0]
 [ 0  1 86  3]
 [ 0  0  1 89]]

-----------------------------
| Test Set Confusion Matrix |
-----------------------------

 [[12  3  0  0]
 [ 1 36  2  0]
 [ 0  2 33  2]
 [ 0  0  0 29]]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
-------------------------
| Classification Report |
-------------------------
              precision    recall  f1-score   support

           0       0.92      0.80      0.86        15
           1       0.88      0.92      0.90        39
           2       0.94      0.89      0.92        37
           3       0.94      1.00      0.97        29

    accuracy                           0.92       120
   macro avg       0.92      0.90      0.91       120
weighted avg       0.92      0.92      0.92       120

-------------------------
| ROC Curve |
-------------------------
------------------------------------
|   Decision Tree via. Plot_tree   |
------------------------------------
------------------------------------
|   Decision Tree via. graphviz   |
------------------------------------
Tree 0 PEG ≤ 0.415 log_loss = 2.0 samples = 120 value = [30.0, 30.0, 30.0, 30.0] 1 PEG ≤ 0.135 log_loss = 1.338 samples = 61 value = [30.0, 30.0, 5.676, 0.0] 0->1 True 12 LPR ≤ 0.135 log_loss = 0.992 samples = 59 value = [0.0, 0.0, 24.324, 30.0] 0->12 False 2 LPR ≤ 0.665 log_loss = 0.328 samples = 14 value = [24.0, 1.538, 0.0, 0.0] 1->2 5 PEG ≤ 0.255 log_loss = 1.161 samples = 47 value = [6.0, 28.462, 5.676, 0.0] 1->5 3 log_loss = 0.0 samples = 12 value = [24, 0, 0, 0] 2->3 4 log_loss = 0.0 samples = 2 value = [0.0, 1.538, 0.0, 0.0] 2->4 6 STG ≤ 0.35 log_loss = 0.927 samples = 18 value = [6.0, 11.538, 0.0, 0.0] 5->6 9 LPR ≤ 0.555 log_loss = 0.813 samples = 29 value = [0.0, 16.923, 5.676, 0.0] 5->9 7 log_loss = 0.965 samples = 8 value = [6.0, 3.846, 0.0, 0.0] 6->7 8 log_loss = 0.0 samples = 10 value = [0.0, 7.692, 0.0, 0.0] 6->8 10 log_loss = 0.0 samples = 21 value = [0.0, 16.154, 0.0, 0.0] 9->10 11 log_loss = 0.528 samples = 8 value = [0.0, 0.769, 5.676, 0.0] 9->11 13 log_loss = 0.0 samples = 7 value = [0.0, 0.0, 5.676, 0.0] 12->13 14 PEG ≤ 0.665 log_loss = 0.96 samples = 52 value = [0.0, 0.0, 18.649, 30.0] 12->14 15 LPR ≤ 0.85 log_loss = 0.469 samples = 25 value = [0.0, 0.0, 18.649, 2.069] 14->15 18 log_loss = 0.0 samples = 27 value = [0.0, 0.0, 0.0, 27.931] 14->18 16 log_loss = 0.0 samples = 23 value = [0.0, 0.0, 18.649, 0.0] 15->16 17 log_loss = 0.0 samples = 2 value = [0.0, 0.0, 0.0, 2.069] 15->17
--------------------------------------
|      Decision Tree Description     |
--------------------------------------
The binary tree structure has 19 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 4] <= 0.41499999165534973 else to node 12.
	node=1 is a split node: go to node 2 if X[:, 4] <= 0.13499999791383743 else to node 5.
		node=2 is a split node: go to node 3 if X[:, 3] <= 0.6649999916553497 else to node 4.
			node=3 is a leaf node.
			node=4 is a leaf node.
		node=5 is a split node: go to node 6 if X[:, 4] <= 0.2549999952316284 else to node 9.
			node=6 is a split node: go to node 7 if X[:, 0] <= 0.3499999940395355 else to node 8.
				node=7 is a leaf node.
				node=8 is a leaf node.
			node=9 is a split node: go to node 10 if X[:, 3] <= 0.5550000071525574 else to node 11.
				node=10 is a leaf node.
				node=11 is a leaf node.
	node=12 is a split node: go to node 13 if X[:, 3] <= 0.13499999791383743 else to node 14.
		node=13 is a leaf node.
		node=14 is a split node: go to node 15 if X[:, 4] <= 0.6650000214576721 else to node 18.
			node=15 is a split node: go to node 16 if X[:, 3] <= 0.8500000238418579 else to node 17.
				node=16 is a leaf node.
				node=17 is a leaf node.
			node=18 is a leaf node.

--------------------------------------
|     ccp_alphas for Post Pruning     |
--------------------------------------

[0.         0.00631268 0.00765247 0.0090142  0.01053307 0.01083371
 0.01111111 0.01125547 0.01258146 0.01771874 0.02094643 0.02599923
 0.02899021 0.03215664 0.06407498 0.09761783 0.18459135 0.35301946
 0.80604746]
--------------------------
|      Best Estimator     |
--------------------------

	DecisionTreeClassifier(ccp_alpha=0.032157, class_weight='balanced',
                       criterion='log_loss', max_features='auto',
                       random_state=0)

--------------------------
|     Best parameters     |
--------------------------
	Parameters of best estimator : 

	{'ccp_alpha': 0.032157, 'class_weight': 'balanced', 'criterion': 'log_loss', 'max_features': 'auto', 'random_state': 0}

---------------------------------
|   No of CrossValidation sets   |
--------------------------------

	Total number of cross validation sets: 3

--------------------------
|        Best Score       |
--------------------------

	Average Cross Validate scores of best estimator : 

	0.9055555555555556

9.4 Compare performance of models

In [1222]:
dFFeature_Subset_Result.drop(dFFeature_Subset_Result.index, inplace=True)
In [1223]:
for item in Feature_Subset_Result:
    dFFeature_Subset_Result.loc[dFFeature_Subset_Result.shape[0]] = item
In [1224]:
print(dFFeature_Subset_Result[['Best Estimator','Training Set Accuracy', 'Testing Set Accuracy', 'Training Time', 'Testing Time',
                                                'Avg. Cross Validation Score of Best Estimator']].to_markdown(tablefmt="pretty"))
+----+----------------------------------------------------------------------------------+-----------------------+----------------------+------------------------+------------------------+-----------------------------------------------+
|    |                                  Best Estimator                                  | Training Set Accuracy | Testing Set Accuracy |     Training Time      |      Testing Time      | Avg. Cross Validation Score of Best Estimator |
+----+----------------------------------------------------------------------------------+-----------------------+----------------------+------------------------+------------------------+-----------------------------------------------+
| 0  |       DecisionTreeClassifier(class_weight='balanced', max_features='auto',       |          1.0          |  0.8833333333333333  | 0 days 00:00:08.519116 | 0 days 00:00:00.006995 |               0.906459404706249               |
|    |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
| 1  |           DecisionTreeClassifier(max_features='auto', random_state=0)            |          1.0          |  0.8916666666666667  | 0 days 00:00:00.327816 | 0 days 00:00:00.006993 |              0.9277777777777777               |
| 2  |                                     chef.fit                                     |        0.7086         |        0.650         | 0 days 00:00:15.182259 | 0 days 00:00:00.094945 |                                               |
| 3  |                                     chef.fit                                     |        0.6055         |        0.6583        | 0 days 00:00:14.784230 | 0 days 00:00:00.056966 |                                               |
| 4  |       DecisionTreeClassifier(class_weight='balanced', criterion='entropy',       |          1.0          |  0.9166666666666666  | 0 days 00:00:00.241866 | 0 days 00:00:00.004998 |              0.9246922237805828               |
|    |                                  max_features='auto', random_state=0)            |                       |                      |                        |                        |                                               |
| 5  | DecisionTreeClassifier(criterion='entropy', max_features='auto', random_state=0) |          1.0          |  0.9083333333333333  | 0 days 00:00:00.290833 | 0 days 00:00:00.008995 |              0.9222222222222222               |
| 6  |      DecisionTreeClassifier(class_weight='balanced', criterion='log_loss',       |          1.0          |  0.9166666666666666  | 0 days 00:00:00.299831 | 0 days 00:00:00.007997 |              0.9246922237805828               |
|    |                                  max_features='auto', random_state=0)            |                       |                      |                        |                        |                                               |
| 7  |        DecisionTreeClassifier(criterion='log_loss', max_features='auto',         |          1.0          |  0.9083333333333333  | 0 days 00:00:00.330811 | 0 days 00:00:00.004999 |              0.9222222222222222               |
|    |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
| 8  |           DecisionTreeClassifier(class_weight='balanced', max_depth=8,           |  0.9928057553956835   |        0.925         | 0 days 00:00:05.274972 | 0 days 00:00:00.004999 |              0.9137447405329593               |
|    |                                  max_features='auto', random_state=0)            |                       |                      |                        |                        |                                               |
| 9  |  DecisionTreeClassifier(max_depth=8, max_features='auto', min_samples_split=3,   |  0.9944444444444445   |  0.8916666666666667  | 0 days 00:00:06.758121 | 0 days 00:00:00.007996 |              0.9305555555555555               |
|    |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
| 10 |       DecisionTreeClassifier(class_weight='balanced', criterion='entropy',       |  0.9964028776978417   |  0.9166666666666666  | 0 days 00:00:05.407894 | 0 days 00:00:00.009995 |              0.9426133707339878               |
|    |                            max_depth=8, max_features='auto', random_state=0)     |                       |                      |                        |                        |                                               |
| 11 |  DecisionTreeClassifier(criterion='entropy', max_depth=10, max_features='auto',  |  0.9833333333333333   |  0.9083333333333333  | 0 days 00:00:07.005537 | 0 days 00:00:00.007997 |              0.9416666666666668               |
|    |                                   min_samples_leaf=2, random_state=0)            |                       |                      |                        |                        |                                               |
| 12 |      DecisionTreeClassifier(class_weight='balanced', criterion='log_loss',       |  0.9964028776978417   |  0.9166666666666666  | 0 days 00:00:05.210007 | 0 days 00:00:00.005997 |              0.9426133707339878               |
|    |                            max_depth=8, max_features='auto', random_state=0)     |                       |                      |                        |                        |                                               |
| 13 | DecisionTreeClassifier(criterion='log_loss', max_depth=10, max_features='auto',  |  0.9833333333333333   |  0.9083333333333333  | 0 days 00:00:04.892190 | 0 days 00:00:00.004997 |              0.9416666666666668               |
|    |                                   min_samples_leaf=2, random_state=0)            |                       |                      |                        |                        |                                               |
| 14 |      DecisionTreeClassifier(ccp_alpha=0.01481786, class_weight='balanced',       |  0.9136690647482014   |  0.8833333333333333  | 0 days 00:00:00.047974 | 0 days 00:00:00.006990 |              0.9065373227364812               |
|    |                                  max_features='auto', random_state=0)            |                       |                      |                        |                        |                                               |
| 15 |       DecisionTreeClassifier(ccp_alpha=0.014468, class_weight='balanced',        |  0.8944444444444445   |        0.875         | 0 days 00:00:00.053970 | 0 days 00:00:00.005996 |              0.8972222222222221               |
|    |                                  max_features='auto', random_state=0)            |                       |                      |                        |                        |                                               |
| 16 |      DecisionTreeClassifier(ccp_alpha=0.02954752, class_weight='balanced',       |  0.9640287769784173   |  0.9333333333333333  | 0 days 00:00:00.049971 | 0 days 00:00:00.007994 |              0.9066152407667133               |
|    |                                criterion='entropy', max_features='auto',         |                       |                      |                        |                        |                                               |
|    |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
| 17 |       DecisionTreeClassifier(ccp_alpha=0.032157, class_weight='balanced',        |  0.9222222222222223   |  0.9166666666666666  | 0 days 00:00:00.052968 | 0 days 00:00:00.006996 |              0.9055555555555556               |
|    |                                criterion='entropy', max_features='auto',         |                       |                      |                        |                        |                                               |
|    |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
| 18 |      DecisionTreeClassifier(ccp_alpha=0.02954752, class_weight='balanced',       |  0.9640287769784173   |  0.9333333333333333  | 0 days 00:00:00.051992 | 0 days 00:00:00.005996 |              0.9066152407667133               |
|    |                               criterion='log_loss', max_features='auto',         |                       |                      |                        |                        |                                               |
|    |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
| 19 |       DecisionTreeClassifier(ccp_alpha=0.032157, class_weight='balanced',        |  0.9222222222222223   |  0.9166666666666666  | 0 days 00:00:00.053968 | 0 days 00:00:00.005996 |              0.9055555555555556               |
|    |                               criterion='log_loss', max_features='auto',         |                       |                      |                        |                        |                                               |
|    |                                             random_state=0)                      |                       |                      |                        |                        |                                               |
+----+----------------------------------------------------------------------------------+-----------------------+----------------------+------------------------+------------------------+-----------------------------------------------+

10.0 Q-7: Compare the influence of gain ratio and gini index on the performance of the algorithm - 1 Mark.

  • Gini Gain vs Information Gain vs Gain Ratio

    • They are all attribute selection methods in decision tree

    • Gini Gain forces the resulting tree to be binary

      • Gini impurity is the probability of a random sample being classified correctly if you randomly pick a label according to the distribution in the branch.
      • When the split is pure, it's 0 Gini Impurity
    • Information Gain allows multiway splits.

      • Entropy calculates how lack of purity the labels are after the split. Information gain is the opposite, it represents the purity of labels after the split.
    • Gain Ratio

      • Information Gain has bias towards the attributes that have larger amount of different values, since this will lead to higher number of branches and each branch is pure. This could make the algorithm useless. To overcome this problem, Gain Ratio has been used.
      • When a categorical variable has very large number of categories, Gain Ratio is preferred over Information Gain.

C) Best scores using Original and Balance Data after post-pruning

  • Best Scores with Gini Index using Gini @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.875
  • Best Scores with Information Gain using entropy @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.9333

B) Best scores using Original and Balance Data after pre-pruning

  • Best Scores with Gini Index using Gini @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.9250
  • Best Scores with Information Gain using Log_Loss @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.9166

A) Best scores using Original and Balance Data before any pruning

  • Best Scores with Gini Index using Gini @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.8916
  • Best Scores with Gain Ratio using ChefBoost
    • Testing Set Accuracy: 0.6583
  • Best Scores with Information Gain using Log_Loss @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.9166

Conclusion: Overall the scores have improved after pre-pruning and then again after post-pruning. The final best model is as follows:

  • Best Scores with Information Gain using entropy @ DecisionTreeClassifier
    • Testing Set Accuracy: 0.9333
In [1225]:
#Print the details of the mode with best scores
print(dFFeature_Subset_Result.iloc[16].to_markdown(tablefmt="pretty"))
+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+
|                                               |                                                            16                                                            |
+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+
|                Best Estimator                 |                          DecisionTreeClassifier(ccp_alpha=0.02954752, class_weight='balanced',                           |
|                                               |                                                    criterion='entropy', max_features='auto',                             |
|                                               |                                                                 random_state=0)                                          |
|                     Model                     |                            GridSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_jobs=-1,                             |
|                                               |                                                param_grid={'ccp_alpha': [0.02954752],                                    |
|                                               |                                              'class_weight': ['balanced'], 'criterion': ['entropy'],                     |
|                                               |                                                  'max_features': ['auto'], 'random_state': [0]},                         |
|                                               |                                                              verbose=1)                                                  |
|                 Training Time                 |                                                  0 days 00:00:00.049971                                                  |
|                 Testing Time                  |                                                  0 days 00:00:00.007994                                                  |
|             Training Set Accuracy             |                                                    0.9640287769784173                                                    |
|             Testing Set Accuracy              |                                                    0.9333333333333333                                                    |
|         Training Set Confusion Matrix         |                                                      [[35  0  0  0]                                                      |
|                                               |                                                       [ 2 87  1  0]                                                      |
|                                               |                                                       [ 0  5 79  1]                                                      |
|                                               |                                                      [ 0  0  1 67]]                                                      |
|         Testing Set Confusion Matrix          |                                                      [[12  3  0  0]                                                      |
|                                               |                                                       [ 2 37  0  0]                                                      |
|                                               |                                                       [ 0  2 34  1]                                                      |
|                                               |                                                      [ 0  0  0 29]]                                                      |
|             Classifiction Report              |                                         precision    recall  f1-score   support                                          |
|                                               |                                                                                                                          |
|                                               |                                             0       0.86      0.80      0.83        15                                   |
|                                               |                                             1       0.88      0.95      0.91        39                                   |
|                                               |                                             2       1.00      0.92      0.96        37                                   |
|                                               |                                             3       0.97      1.00      0.98        29                                   |
|                                               |                                                                                                                          |
|                                               |                                      accuracy                           0.93       120                                   |
|                                               |                                     macro avg       0.93      0.92      0.92       120                                   |
|                                               |                                  weighted avg       0.94      0.93      0.93       120                                   |
|         Parameters of best estimator          | {'ccp_alpha': 0.02954752, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto', 'random_state': 0} |
| Avg. Cross Validation Score of Best Estimator |                                                    0.9066152407667133                                                    |
|     Total number of cross validation sets     |                                                            3                                                             |
|                      FPR                      |                           [0.         0.04938272 0.0617284  0.0617284  0.12345679 1.        ]                            |
|                      TPR                      |                           [0.         0.79487179 0.94871795 0.97435897 0.97435897 1.        ]                            |
|                     P_FPR                     |                                                         [0. 1.]                                                          |
|                     P_TPR                     |                                                         [0. 1.]                                                          |
|                  ccp_alphas                   |                               [0.00000000e+00 1.94325505e-17 2.35305905e-16 3.04866340e-16                               |
|                                               |                                5.62521852e-03 5.71558615e-03 6.11343912e-03 7.43356304e-03                               |
|                                               |                                8.48661518e-03 8.63291106e-03 1.09367460e-02 1.12140546e-02                               |
|                                               |                                1.32357191e-02 1.68540384e-02 2.95475235e-02 3.13389697e-02                               |
|                                               |                                6.73639191e-02 9.76412209e-02 1.39081296e-01 2.58130803e-01                               |
|                                               |                                              3.69263963e-01 8.12292430e-01]                                              |
|                    X_train                    |                                               STG    SCG   STR   LPR   PEG                                               |
|                                               |                                           68   0.280  0.100  0.12  0.28  0.32                                            |
|                                               |                                           280  0.110  0.260  0.56  0.68  0.27                                            |
|                                               |                                           158  0.465  0.258  0.73  0.18  0.59                                            |
|                                               |                                           115  0.285  0.640  0.18  0.61  0.45                                            |
|                                               |                                           292  0.140  0.380  0.59  0.11  0.32                                            |
|                                               |                                           ..     ...    ...   ...   ...   ...                                            |
|                                               |                                           118  0.280  0.780  0.44  0.17  0.66                                            |
|                                               |                                           302  0.210  0.780  0.42  0.32  0.84                                            |
|                                               |                                           140  0.330  0.040  0.50  0.55  0.10                                            |
|                                               |                                           244  0.620  0.620  0.24  0.65  0.25                                            |
|                                               |                                           332  0.280  0.060  0.70  0.27  0.32                                            |
|                                               |                                                                                                                          |
|                                               |                                                  [278 rows x 5 columns]                                                  |
|                    y_train                    |                                                         68     1                                                         |
|                                               |                                                         280    1                                                         |
|                                               |                                                         158    2                                                         |
|                                               |                                                         115    2                                                         |
|                                               |                                                         292    1                                                         |
|                                               |                                                               ..                                                         |
|                                               |                                                         118    2                                                         |
|                                               |                                                         302    3                                                         |
|                                               |                                                         140    0                                                         |
|                                               |                                                         244    2                                                         |
|                                               |                                                         332    1                                                         |
|                                               |                                         Name: UNS, Length: 278, dtype: category                                          |
|                                               |                                           Categories (4, int64): [3, 1, 2, 0]                                            |
|                    X_test                     |                                               STG   SCG   STR   LPR   PEG                                                |
|                                               |                                            335  0.32  0.20  0.84  0.81  0.80                                             |
|                                               |                                            100  0.27  0.28  0.18  0.48  0.26                                             |
|                                               |                                            249  0.72  0.60  0.45  0.79  0.45                                             |
|                                               |                                            108  0.32  0.27  0.52  0.81  0.30                                             |
|                                               |                                            319  0.29  0.10  0.17  0.74  0.52                                             |
|                                               |                                            ..    ...   ...   ...   ...   ...                                             |
|                                               |                                            16   0.05  0.07  0.70  0.01  0.05                                             |
|                                               |                                            391  0.58  0.40  0.32  0.22  0.24                                             |
|                                               |                                            327  0.25  0.05  0.53  0.10  0.12                                             |
|                                               |                                            275  0.00  0.25  0.50  0.09  0.07                                             |
|                                               |                                            288  0.18  0.37  0.11  0.28  0.30                                             |
|                                               |                                                                                                                          |
|                                               |                                                  [120 rows x 5 columns]                                                  |
|                    y_test                     |                                                         335    3                                                         |
|                                               |                                                         100    1                                                         |
|                                               |                                                         249    2                                                         |
|                                               |                                                         108    2                                                         |
|                                               |                                                         319    2                                                         |
|                                               |                                                               ..                                                         |
|                                               |                                                         16     0                                                         |
|                                               |                                                         391    1                                                         |
|                                               |                                                         327    0                                                         |
|                                               |                                                         275    0                                                         |
|                                               |                                                         288    1                                                         |
|                                               |                                         Name: UNS, Length: 120, dtype: category                                          |
|                                               |                                           Categories (4, int64): [3, 1, 2, 0]                                            |
+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+